xiaoanyu123 commited on
Commit
34bcae4
·
verified ·
1 Parent(s): 8f3b606

Add files using upload-large-folder tool

Browse files
Files changed (20) hide show
  1. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogvideo/__pycache__/__init__.cpython-310.pyc +0 -0
  2. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogvideo/__pycache__/pipeline_cogvideox_video2video.cpython-310.pyc +0 -0
  3. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogvideo/__pycache__/pipeline_output.cpython-310.pyc +0 -0
  4. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview3/__init__.py +47 -0
  5. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview3/__pycache__/__init__.cpython-310.pyc +0 -0
  6. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview3/__pycache__/pipeline_cogview3plus.cpython-310.pyc +0 -0
  7. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview3/__pycache__/pipeline_output.cpython-310.pyc +0 -0
  8. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview3/pipeline_cogview3plus.py +682 -0
  9. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview3/pipeline_output.py +21 -0
  10. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview4/__init__.py +49 -0
  11. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview4/__pycache__/__init__.cpython-310.pyc +0 -0
  12. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview4/__pycache__/pipeline_cogview4.cpython-310.pyc +0 -0
  13. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview4/__pycache__/pipeline_cogview4_control.cpython-310.pyc +0 -0
  14. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview4/__pycache__/pipeline_output.cpython-310.pyc +0 -0
  15. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview4/pipeline_cogview4.py +685 -0
  16. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview4/pipeline_cogview4_control.py +732 -0
  17. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview4/pipeline_output.py +21 -0
  18. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/consisid/consisid_utils.py +357 -0
  19. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/consisid/pipeline_consisid.py +974 -0
  20. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/consisid/pipeline_output.py +20 -0
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogvideo/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.35 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogvideo/__pycache__/pipeline_cogvideox_video2video.cpython-310.pyc ADDED
Binary file (28.7 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogvideo/__pycache__/pipeline_output.cpython-310.pyc ADDED
Binary file (988 Bytes). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview3/__init__.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Lazy-import plumbing for the `cogview3` pipeline subpackage: modules listed in
# `_import_structure` are only imported on first attribute access, unless slow
# imports are forced (DIFFUSERS_SLOW_IMPORT) or a type checker is running.
from typing import TYPE_CHECKING

from ...utils import (
    DIFFUSERS_SLOW_IMPORT,
    OptionalDependencyNotAvailable,
    _LazyModule,
    get_objects_from_module,
    is_torch_available,
    is_transformers_available,
)


# Filled with placeholder objects when torch/transformers are missing.
_dummy_objects = {}
_additional_imports = {}
# Map of submodule name -> public names it exports; consumed by `_LazyModule`.
_import_structure = {"pipeline_output": ["CogView3PlusPipelineOutput"]}

try:
    if not (is_transformers_available() and is_torch_available()):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    # Missing optional deps: expose dummy stand-ins that raise a helpful error on use.
    from ...utils import dummy_torch_and_transformers_objects  # noqa F403

    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
    _import_structure["pipeline_cogview3plus"] = ["CogView3PlusPipeline"]

if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    # Eager path: perform the real (or dummy) imports immediately.
    try:
        if not (is_transformers_available() and is_torch_available()):
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
    else:
        from .pipeline_cogview3plus import CogView3PlusPipeline
else:
    import sys

    # Replace this module object with a lazy proxy that imports submodules on demand.
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )

    # Attach dummy objects (and any extra names) directly onto the lazy module.
    for name, value in _dummy_objects.items():
        setattr(sys.modules[__name__], name, value)
    for name, value in _additional_imports.items():
        setattr(sys.modules[__name__], name, value)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview3/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.16 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview3/__pycache__/pipeline_cogview3plus.cpython-310.pyc ADDED
Binary file (23.3 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview3/__pycache__/pipeline_output.cpython-310.pyc ADDED
Binary file (1.01 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview3/pipeline_cogview3plus.py ADDED
@@ -0,0 +1,682 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import inspect
17
+ from typing import Callable, Dict, List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ from transformers import T5EncoderModel, T5Tokenizer
21
+
22
+ from ...callbacks import MultiPipelineCallbacks, PipelineCallback
23
+ from ...image_processor import VaeImageProcessor
24
+ from ...models import AutoencoderKL, CogView3PlusTransformer2DModel
25
+ from ...pipelines.pipeline_utils import DiffusionPipeline
26
+ from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
27
+ from ...utils import is_torch_xla_available, logging, replace_example_docstring
28
+ from ...utils.torch_utils import randn_tensor
29
+ from .pipeline_output import CogView3PipelineOutput
30
+
31
+
32
# torch_xla is only installed on XLA/TPU hosts; record availability in a flag
# (consumed later in the file — the consumer is outside this chunk).
if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


# Usage example spliced into `__call__`'s docstring via `@replace_example_docstring`.
EXAMPLE_DOC_STRING = """
    Examples:
        ```python
        >>> import torch
        >>> from diffusers import CogView3PlusPipeline

        >>> pipe = CogView3PlusPipeline.from_pretrained("THUDM/CogView3-Plus-3B", torch_dtype=torch.bfloat16)
        >>> pipe.to("cuda")

        >>> prompt = "A photo of an astronaut riding a horse on mars"
        >>> image = pipe(prompt).images[0]
        >>> image.save("output.png")
        ```
"""
56
+
57
+
58
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    r"""
    Configure `scheduler` and return its timestep schedule.

    Exactly one schedule source may be used: an explicit `timesteps` list, an explicit
    `sigmas` list, or a plain `num_inference_steps` count. Custom lists are forwarded to
    `scheduler.set_timesteps` only when its signature accepts them; any extra kwargs are
    passed through unchanged.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")

    def _accepts(keyword: str) -> bool:
        # Whether `scheduler.set_timesteps` declares the given keyword parameter.
        return keyword in inspect.signature(scheduler.set_timesteps).parameters

    if timesteps is not None:
        if not _accepts("timesteps"):
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        schedule = scheduler.timesteps
        num_inference_steps = len(schedule)
    elif sigmas is not None:
        if not _accepts("sigmas"):
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        schedule = scheduler.timesteps
        num_inference_steps = len(schedule)
    else:
        # Default path: the caller's step count is trusted and returned as-is.
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        schedule = scheduler.timesteps
    return schedule, num_inference_steps
116
+
117
+
118
class CogView3PlusPipeline(DiffusionPipeline):
    r"""
    Pipeline for text-to-image generation using CogView3Plus.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`T5EncoderModel`]):
            Frozen text-encoder. CogView3Plus uses
            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the
            [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
        tokenizer (`T5Tokenizer`):
            Tokenizer of class
            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
        transformer ([`CogView3PlusTransformer2DModel`]):
            A text conditioned `CogView3PlusTransformer2DModel` to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
    """

    # No components may be omitted when loading this pipeline.
    _optional_components = []
    # Component chain presumably consumed by DiffusionPipeline's CPU-offload helpers
    # (defined outside this chunk) — order: text encoding, denoising, decoding.
    model_cpu_offload_seq = "text_encoder->transformer->vae"

    # Tensor names that `callback_on_step_end_tensor_inputs` may reference
    # (validated in `check_inputs`).
    _callback_tensor_inputs = [
        "latents",
        "prompt_embeds",
        "negative_prompt_embeds",
    ]
149
+
150
+ def __init__(
151
+ self,
152
+ tokenizer: T5Tokenizer,
153
+ text_encoder: T5EncoderModel,
154
+ vae: AutoencoderKL,
155
+ transformer: CogView3PlusTransformer2DModel,
156
+ scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler],
157
+ ):
158
+ super().__init__()
159
+
160
+ self.register_modules(
161
+ tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
162
+ )
163
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
164
+
165
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
166
+
167
+ # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._get_t5_prompt_embeds with num_videos_per_prompt->num_images_per_prompt
168
+ def _get_t5_prompt_embeds(
169
+ self,
170
+ prompt: Union[str, List[str]] = None,
171
+ num_images_per_prompt: int = 1,
172
+ max_sequence_length: int = 226,
173
+ device: Optional[torch.device] = None,
174
+ dtype: Optional[torch.dtype] = None,
175
+ ):
176
+ device = device or self._execution_device
177
+ dtype = dtype or self.text_encoder.dtype
178
+
179
+ prompt = [prompt] if isinstance(prompt, str) else prompt
180
+ batch_size = len(prompt)
181
+
182
+ text_inputs = self.tokenizer(
183
+ prompt,
184
+ padding="max_length",
185
+ max_length=max_sequence_length,
186
+ truncation=True,
187
+ add_special_tokens=True,
188
+ return_tensors="pt",
189
+ )
190
+ text_input_ids = text_inputs.input_ids
191
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
192
+
193
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
194
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
195
+ logger.warning(
196
+ "The following part of your input was truncated because `max_sequence_length` is set to "
197
+ f" {max_sequence_length} tokens: {removed_text}"
198
+ )
199
+
200
+ prompt_embeds = self.text_encoder(text_input_ids.to(device))[0]
201
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
202
+
203
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
204
+ _, seq_len, _ = prompt_embeds.shape
205
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
206
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
207
+
208
+ return prompt_embeds
209
+
210
    def encode_prompt(
        self,
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
        do_classifier_free_guidance: bool = True,
        num_images_per_prompt: int = 1,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        max_sequence_length: int = 224,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
                Whether to use classifier free guidance or not.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                Number of images that should be generated per prompt. torch device to place the resulting embeddings on
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            max_sequence_length (`int`, defaults to `224`):
                Maximum sequence length in encoded prompt. Can be set to other values but may lead to poorer results.
            device: (`torch.device`, *optional*):
                torch device
            dtype: (`torch.dtype`, *optional*):
                torch dtype
        """
        device = device or self._execution_device

        prompt = [prompt] if isinstance(prompt, str) else prompt
        if prompt is not None:
            batch_size = len(prompt)
        else:
            # No raw prompt supplied: infer batch size from the precomputed embeddings.
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            prompt_embeds = self._get_t5_prompt_embeds(
                prompt=prompt,
                num_images_per_prompt=num_images_per_prompt,
                max_sequence_length=max_sequence_length,
                device=device,
                dtype=dtype,
            )

        # With CFG on and no negative prompt text, the unconditional embedding is all
        # zeros. NOTE: this branch runs even when the caller passed
        # `negative_prompt_embeds`, silently replacing them with zeros in that case.
        if do_classifier_free_guidance and negative_prompt is None:
            negative_prompt_embeds = prompt_embeds.new_zeros(prompt_embeds.shape)

        # With CFG on and a negative prompt given (and no precomputed negative
        # embeddings), validate it against `prompt` and encode it.
        if do_classifier_free_guidance and negative_prompt_embeds is None:
            # Broadcast a single negative prompt string across the whole batch.
            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt

            if prompt is not None and type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )

            negative_prompt_embeds = self._get_t5_prompt_embeds(
                prompt=negative_prompt,
                num_images_per_prompt=num_images_per_prompt,
                max_sequence_length=max_sequence_length,
                device=device,
                dtype=dtype,
            )

        return prompt_embeds, negative_prompt_embeds
294
+
295
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
296
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
297
+ shape = (
298
+ batch_size,
299
+ num_channels_latents,
300
+ int(height) // self.vae_scale_factor,
301
+ int(width) // self.vae_scale_factor,
302
+ )
303
+ if isinstance(generator, list) and len(generator) != batch_size:
304
+ raise ValueError(
305
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
306
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
307
+ )
308
+
309
+ if latents is None:
310
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
311
+ else:
312
+ latents = latents.to(device)
313
+
314
+ # scale the initial noise by the standard deviation required by the scheduler
315
+ latents = latents * self.scheduler.init_noise_sigma
316
+ return latents
317
+
318
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
319
+ def prepare_extra_step_kwargs(self, generator, eta):
320
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
321
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
322
+ # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
323
+ # and should be between [0, 1]
324
+
325
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
326
+ extra_step_kwargs = {}
327
+ if accepts_eta:
328
+ extra_step_kwargs["eta"] = eta
329
+
330
+ # check if the scheduler accepts generator
331
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
332
+ if accepts_generator:
333
+ extra_step_kwargs["generator"] = generator
334
+ return extra_step_kwargs
335
+
336
+ # Copied from diffusers.pipelines.latte.pipeline_latte.LattePipeline.check_inputs
337
+ def check_inputs(
338
+ self,
339
+ prompt,
340
+ height,
341
+ width,
342
+ negative_prompt,
343
+ callback_on_step_end_tensor_inputs,
344
+ prompt_embeds=None,
345
+ negative_prompt_embeds=None,
346
+ ):
347
+ if height % 8 != 0 or width % 8 != 0:
348
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
349
+
350
+ if callback_on_step_end_tensor_inputs is not None and not all(
351
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
352
+ ):
353
+ raise ValueError(
354
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
355
+ )
356
+ if prompt is not None and prompt_embeds is not None:
357
+ raise ValueError(
358
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
359
+ " only forward one of the two."
360
+ )
361
+ elif prompt is None and prompt_embeds is None:
362
+ raise ValueError(
363
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
364
+ )
365
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
366
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
367
+
368
+ if prompt is not None and negative_prompt_embeds is not None:
369
+ raise ValueError(
370
+ f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
371
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
372
+ )
373
+
374
+ if negative_prompt is not None and negative_prompt_embeds is not None:
375
+ raise ValueError(
376
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
377
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
378
+ )
379
+
380
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
381
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
382
+ raise ValueError(
383
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
384
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
385
+ f" {negative_prompt_embeds.shape}."
386
+ )
387
+
388
    @property
    def guidance_scale(self):
        """CFG weight stored by `__call__` (read-only view, e.g. for callbacks)."""
        return self._guidance_scale

    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    @property
    def do_classifier_free_guidance(self):
        """Whether classifier-free guidance is active (`guidance_scale > 1`)."""
        return self._guidance_scale > 1

    @property
    def num_timesteps(self):
        """Number of denoising timesteps set by `__call__`."""
        return self._num_timesteps

    @property
    def interrupt(self):
        """Flag set in `__call__`; when True the denoising loop should stop early."""
        return self._interrupt
406
+
407
+ @torch.no_grad()
408
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
409
+ def __call__(
410
+ self,
411
+ prompt: Optional[Union[str, List[str]]] = None,
412
+ negative_prompt: Optional[Union[str, List[str]]] = None,
413
+ height: Optional[int] = None,
414
+ width: Optional[int] = None,
415
+ num_inference_steps: int = 50,
416
+ timesteps: Optional[List[int]] = None,
417
+ guidance_scale: float = 5.0,
418
+ num_images_per_prompt: int = 1,
419
+ eta: float = 0.0,
420
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
421
+ latents: Optional[torch.FloatTensor] = None,
422
+ prompt_embeds: Optional[torch.FloatTensor] = None,
423
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
424
+ original_size: Optional[Tuple[int, int]] = None,
425
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
426
+ output_type: str = "pil",
427
+ return_dict: bool = True,
428
+ callback_on_step_end: Optional[
429
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
430
+ ] = None,
431
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
432
+ max_sequence_length: int = 224,
433
+ ) -> Union[CogView3PipelineOutput, Tuple]:
434
+ """
435
+ Function invoked when calling the pipeline for generation.
436
+
437
+ Args:
438
+ prompt (`str` or `List[str]`, *optional*):
439
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
440
+ negative_prompt (`str` or `List[str]`, *optional*):
441
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
442
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
443
+ less than `1`).
444
+ height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor):
445
+ The height in pixels of the generated image. If not provided, it is set to 1024.
446
+ width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor):
447
+ The width in pixels of the generated image. If not provided it is set to 1024.
448
+ num_inference_steps (`int`, *optional*, defaults to `50`):
449
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
450
+ expense of slower inference.
451
+ timesteps (`List[int]`, *optional*):
452
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
453
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
454
+ passed will be used. Must be in descending order.
455
+ guidance_scale (`float`, *optional*, defaults to `5.0`):
456
+ Guidance scale as defined in [Classifier-Free Diffusion
457
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
458
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
459
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
460
+ the text `prompt`, usually at the expense of lower image quality.
461
+ num_images_per_prompt (`int`, *optional*, defaults to `1`):
462
+ The number of images to generate per prompt.
463
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
464
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
465
+ to make generation deterministic.
466
+ latents (`torch.FloatTensor`, *optional*):
467
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
468
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
469
+ tensor will be generated by sampling using the supplied random `generator`.
470
+ prompt_embeds (`torch.FloatTensor`, *optional*):
471
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
472
+ provided, text embeddings will be generated from `prompt` input argument.
473
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
474
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
475
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
476
+ argument.
477
+ original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
478
+ If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
479
+ `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
480
+ explained in section 2.2 of
481
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
482
+ crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
483
+ `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
484
+ `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
485
+ `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
486
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
487
+ output_type (`str`, *optional*, defaults to `"pil"`):
488
+ The output format of the generate image. Choose between
489
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
490
+ return_dict (`bool`, *optional*, defaults to `True`):
491
+ Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
492
+ of a plain tuple.
493
+ attention_kwargs (`dict`, *optional*):
494
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
495
+ `self.processor` in
496
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
497
+ callback_on_step_end (`Callable`, *optional*):
498
+ A function that calls at the end of each denoising steps during the inference. The function is called
499
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
500
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
501
+ `callback_on_step_end_tensor_inputs`.
502
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
503
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
504
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
505
+ `._callback_tensor_inputs` attribute of your pipeline class.
506
+ max_sequence_length (`int`, defaults to `224`):
507
+ Maximum sequence length in encoded prompt. Can be set to other values but may lead to poorer results.
508
+
509
+ Examples:
510
+
511
+ Returns:
512
+ [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] or `tuple`:
513
+ [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] if `return_dict` is True, otherwise a
514
+ `tuple`. When returning a tuple, the first element is a list with the generated images.
515
+ """
516
+
517
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
518
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
519
+
520
+ height = height or self.transformer.config.sample_size * self.vae_scale_factor
521
+ width = width or self.transformer.config.sample_size * self.vae_scale_factor
522
+
523
+ original_size = original_size or (height, width)
524
+ target_size = (height, width)
525
+
526
+ # 1. Check inputs. Raise error if not correct
527
+ self.check_inputs(
528
+ prompt,
529
+ height,
530
+ width,
531
+ negative_prompt,
532
+ callback_on_step_end_tensor_inputs,
533
+ prompt_embeds,
534
+ negative_prompt_embeds,
535
+ )
536
+ self._guidance_scale = guidance_scale
537
+ self._interrupt = False
538
+
539
+ # 2. Default call parameters
540
+ if prompt is not None and isinstance(prompt, str):
541
+ batch_size = 1
542
+ elif prompt is not None and isinstance(prompt, list):
543
+ batch_size = len(prompt)
544
+ else:
545
+ batch_size = prompt_embeds.shape[0]
546
+
547
+ device = self._execution_device
548
+
549
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
550
+ # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
551
+ # corresponds to doing no classifier free guidance.
552
+ do_classifier_free_guidance = guidance_scale > 1.0
553
+
554
+ # 3. Encode input prompt
555
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
556
+ prompt,
557
+ negative_prompt,
558
+ self.do_classifier_free_guidance,
559
+ num_images_per_prompt=num_images_per_prompt,
560
+ prompt_embeds=prompt_embeds,
561
+ negative_prompt_embeds=negative_prompt_embeds,
562
+ max_sequence_length=max_sequence_length,
563
+ device=device,
564
+ )
565
+ if self.do_classifier_free_guidance:
566
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
567
+
568
+ # 4. Prepare timesteps
569
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
570
+ self._num_timesteps = len(timesteps)
571
+
572
+ # 5. Prepare latents.
573
+ latent_channels = self.transformer.config.in_channels
574
+ latents = self.prepare_latents(
575
+ batch_size * num_images_per_prompt,
576
+ latent_channels,
577
+ height,
578
+ width,
579
+ prompt_embeds.dtype,
580
+ device,
581
+ generator,
582
+ latents,
583
+ )
584
+
585
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
586
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
587
+
588
+ # 7. Prepare additional timestep conditions
589
+ original_size = torch.tensor([original_size], dtype=prompt_embeds.dtype)
590
+ target_size = torch.tensor([target_size], dtype=prompt_embeds.dtype)
591
+ crops_coords_top_left = torch.tensor([crops_coords_top_left], dtype=prompt_embeds.dtype)
592
+
593
+ if self.do_classifier_free_guidance:
594
+ original_size = torch.cat([original_size, original_size])
595
+ target_size = torch.cat([target_size, target_size])
596
+ crops_coords_top_left = torch.cat([crops_coords_top_left, crops_coords_top_left])
597
+
598
+ original_size = original_size.to(device).repeat(batch_size * num_images_per_prompt, 1)
599
+ target_size = target_size.to(device).repeat(batch_size * num_images_per_prompt, 1)
600
+ crops_coords_top_left = crops_coords_top_left.to(device).repeat(batch_size * num_images_per_prompt, 1)
601
+
602
+ # 8. Denoising loop
603
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
604
+
605
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
606
+ # for DPM-solver++
607
+ old_pred_original_sample = None
608
+ for i, t in enumerate(timesteps):
609
+ if self.interrupt:
610
+ continue
611
+
612
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
613
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
614
+
615
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
616
+ timestep = t.expand(latent_model_input.shape[0])
617
+
618
+ # predict noise model_output
619
+ noise_pred = self.transformer(
620
+ hidden_states=latent_model_input,
621
+ encoder_hidden_states=prompt_embeds,
622
+ timestep=timestep,
623
+ original_size=original_size,
624
+ target_size=target_size,
625
+ crop_coords=crops_coords_top_left,
626
+ return_dict=False,
627
+ )[0]
628
+ noise_pred = noise_pred.float()
629
+
630
+ # perform guidance
631
+ if self.do_classifier_free_guidance:
632
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
633
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
634
+
635
+ # compute the previous noisy sample x_t -> x_t-1
636
+ if not isinstance(self.scheduler, CogVideoXDPMScheduler):
637
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
638
+ else:
639
+ latents, old_pred_original_sample = self.scheduler.step(
640
+ noise_pred,
641
+ old_pred_original_sample,
642
+ t,
643
+ timesteps[i - 1] if i > 0 else None,
644
+ latents,
645
+ **extra_step_kwargs,
646
+ return_dict=False,
647
+ )
648
+ latents = latents.to(prompt_embeds.dtype)
649
+
650
+ # call the callback, if provided
651
+ if callback_on_step_end is not None:
652
+ callback_kwargs = {}
653
+ for k in callback_on_step_end_tensor_inputs:
654
+ callback_kwargs[k] = locals()[k]
655
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
656
+
657
+ latents = callback_outputs.pop("latents", latents)
658
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
659
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
660
+
661
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
662
+ progress_bar.update()
663
+
664
+ if XLA_AVAILABLE:
665
+ xm.mark_step()
666
+
667
+ if not output_type == "latent":
668
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
669
+ 0
670
+ ]
671
+ else:
672
+ image = latents
673
+
674
+ image = self.image_processor.postprocess(image, output_type=output_type)
675
+
676
+ # Offload all models
677
+ self.maybe_free_model_hooks()
678
+
679
+ if not return_dict:
680
+ return (image,)
681
+
682
+ return CogView3PipelineOutput(images=image)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview3/pipeline_output.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import List, Union
3
+
4
+ import numpy as np
5
+ import PIL.Image
6
+
7
+ from ...utils import BaseOutput
8
+
9
+
10
+ @dataclass
11
+ class CogView3PipelineOutput(BaseOutput):
12
+ """
13
+ Output class for CogView3 pipelines.
14
+
15
+ Args:
16
+ images (`List[PIL.Image.Image]` or `np.ndarray`)
17
+ List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
18
+ num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
19
+ """
20
+
21
+ images: Union[List[PIL.Image.Image], np.ndarray]
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview4/__init__.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import TYPE_CHECKING
2
+
3
+ from ...utils import (
4
+ DIFFUSERS_SLOW_IMPORT,
5
+ OptionalDependencyNotAvailable,
6
+ _LazyModule,
7
+ get_objects_from_module,
8
+ is_torch_available,
9
+ is_transformers_available,
10
+ )
11
+
12
+
13
+ _dummy_objects = {}
14
+ _additional_imports = {}
15
+ _import_structure = {"pipeline_output": ["CogView4PlusPipelineOutput"]}
16
+
17
+ try:
18
+ if not (is_transformers_available() and is_torch_available()):
19
+ raise OptionalDependencyNotAvailable()
20
+ except OptionalDependencyNotAvailable:
21
+ from ...utils import dummy_torch_and_transformers_objects # noqa F403
22
+
23
+ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
24
+ else:
25
+ _import_structure["pipeline_cogview4"] = ["CogView4Pipeline"]
26
+ _import_structure["pipeline_cogview4_control"] = ["CogView4ControlPipeline"]
27
+ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
28
+ try:
29
+ if not (is_transformers_available() and is_torch_available()):
30
+ raise OptionalDependencyNotAvailable()
31
+ except OptionalDependencyNotAvailable:
32
+ from ...utils.dummy_torch_and_transformers_objects import * # noqa F403
33
+ else:
34
+ from .pipeline_cogview4 import CogView4Pipeline
35
+ from .pipeline_cogview4_control import CogView4ControlPipeline
36
+ else:
37
+ import sys
38
+
39
+ sys.modules[__name__] = _LazyModule(
40
+ __name__,
41
+ globals()["__file__"],
42
+ _import_structure,
43
+ module_spec=__spec__,
44
+ )
45
+
46
+ for name, value in _dummy_objects.items():
47
+ setattr(sys.modules[__name__], name, value)
48
+ for name, value in _additional_imports.items():
49
+ setattr(sys.modules[__name__], name, value)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview4/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.25 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview4/__pycache__/pipeline_cogview4.cpython-310.pyc ADDED
Binary file (24.5 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview4/__pycache__/pipeline_cogview4_control.cpython-310.pyc ADDED
Binary file (25.3 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview4/__pycache__/pipeline_output.cpython-310.pyc ADDED
Binary file (1.01 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview4/pipeline_cogview4.py ADDED
@@ -0,0 +1,685 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import inspect
17
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
18
+
19
+ import numpy as np
20
+ import torch
21
+ from transformers import AutoTokenizer, GlmModel
22
+
23
+ from ...callbacks import MultiPipelineCallbacks, PipelineCallback
24
+ from ...image_processor import VaeImageProcessor
25
+ from ...loaders import CogView4LoraLoaderMixin
26
+ from ...models import AutoencoderKL, CogView4Transformer2DModel
27
+ from ...pipelines.pipeline_utils import DiffusionPipeline
28
+ from ...schedulers import FlowMatchEulerDiscreteScheduler
29
+ from ...utils import is_torch_xla_available, logging, replace_example_docstring
30
+ from ...utils.torch_utils import randn_tensor
31
+ from .pipeline_output import CogView4PipelineOutput
32
+
33
+
34
+ if is_torch_xla_available():
35
+ import torch_xla.core.xla_model as xm
36
+
37
+ XLA_AVAILABLE = True
38
+ else:
39
+ XLA_AVAILABLE = False
40
+
41
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
42
+
43
+ EXAMPLE_DOC_STRING = """
44
+ Examples:
45
+ ```python
46
+ >>> import torch
47
+ >>> from diffusers import CogView4Pipeline
48
+
49
+ >>> pipe = CogView4Pipeline.from_pretrained("THUDM/CogView4-6B", torch_dtype=torch.bfloat16)
50
+ >>> pipe.to("cuda")
51
+
52
+ >>> prompt = "A photo of an astronaut riding a horse on mars"
53
+ >>> image = pipe(prompt).images[0]
54
+ >>> image.save("output.png")
55
+ ```
56
+ """
57
+
58
+
59
+ def calculate_shift(
60
+ image_seq_len,
61
+ base_seq_len: int = 256,
62
+ base_shift: float = 0.25,
63
+ max_shift: float = 0.75,
64
+ ) -> float:
65
+ m = (image_seq_len / base_seq_len) ** 0.5
66
+ mu = m * max_shift + base_shift
67
+ return mu
68
+
69
+
70
+ def retrieve_timesteps(
71
+ scheduler,
72
+ num_inference_steps: Optional[int] = None,
73
+ device: Optional[Union[str, torch.device]] = None,
74
+ timesteps: Optional[List[int]] = None,
75
+ sigmas: Optional[List[float]] = None,
76
+ **kwargs,
77
+ ):
78
+ r"""
79
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
80
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
81
+
82
+ Args:
83
+ scheduler (`SchedulerMixin`):
84
+ The scheduler to get timesteps from.
85
+ num_inference_steps (`int`):
86
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
87
+ must be `None`.
88
+ device (`str` or `torch.device`, *optional*):
89
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
90
+ timesteps (`List[int]`, *optional*):
91
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
92
+ `num_inference_steps` and `sigmas` must be `None`.
93
+ sigmas (`List[float]`, *optional*):
94
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
95
+ `num_inference_steps` and `timesteps` must be `None`.
96
+
97
+ Returns:
98
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
99
+ second element is the number of inference steps.
100
+ """
101
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
102
+ accepts_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
103
+
104
+ if timesteps is not None and sigmas is not None:
105
+ if not accepts_timesteps and not accepts_sigmas:
106
+ raise ValueError(
107
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
108
+ f" timestep or sigma schedules. Please check whether you are using the correct scheduler."
109
+ )
110
+ scheduler.set_timesteps(timesteps=timesteps, sigmas=sigmas, device=device, **kwargs)
111
+ timesteps = scheduler.timesteps
112
+ num_inference_steps = len(timesteps)
113
+ elif timesteps is not None and sigmas is None:
114
+ if not accepts_timesteps:
115
+ raise ValueError(
116
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
117
+ f" timestep schedules. Please check whether you are using the correct scheduler."
118
+ )
119
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
120
+ timesteps = scheduler.timesteps
121
+ num_inference_steps = len(timesteps)
122
+ elif timesteps is None and sigmas is not None:
123
+ if not accepts_sigmas:
124
+ raise ValueError(
125
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
126
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
127
+ )
128
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
129
+ timesteps = scheduler.timesteps
130
+ num_inference_steps = len(timesteps)
131
+ else:
132
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
133
+ timesteps = scheduler.timesteps
134
+ return timesteps, num_inference_steps
135
+
136
+
137
+ class CogView4Pipeline(DiffusionPipeline, CogView4LoraLoaderMixin):
138
+ r"""
139
+ Pipeline for text-to-image generation using CogView4.
140
+
141
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
142
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
143
+
144
+ Args:
145
+ vae ([`AutoencoderKL`]):
146
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
147
+ text_encoder ([`GLMModel`]):
148
+ Frozen text-encoder. CogView4 uses [glm-4-9b-hf](https://huggingface.co/THUDM/glm-4-9b-hf).
149
+ tokenizer (`PreTrainedTokenizer`):
150
+ Tokenizer of class
151
+ [PreTrainedTokenizer](https://huggingface.co/docs/transformers/main/en/main_classes/tokenizer#transformers.PreTrainedTokenizer).
152
+ transformer ([`CogView4Transformer2DModel`]):
153
+ A text conditioned `CogView4Transformer2DModel` to denoise the encoded image latents.
154
+ scheduler ([`SchedulerMixin`]):
155
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
156
+ """
157
+
158
+ _optional_components = []
159
+ model_cpu_offload_seq = "text_encoder->transformer->vae"
160
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
161
+
162
+ def __init__(
163
+ self,
164
+ tokenizer: AutoTokenizer,
165
+ text_encoder: GlmModel,
166
+ vae: AutoencoderKL,
167
+ transformer: CogView4Transformer2DModel,
168
+ scheduler: FlowMatchEulerDiscreteScheduler,
169
+ ):
170
+ super().__init__()
171
+
172
+ self.register_modules(
173
+ tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
174
+ )
175
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
176
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
177
+
178
+ def _get_glm_embeds(
179
+ self,
180
+ prompt: Union[str, List[str]] = None,
181
+ max_sequence_length: int = 1024,
182
+ device: Optional[torch.device] = None,
183
+ dtype: Optional[torch.dtype] = None,
184
+ ):
185
+ device = device or self._execution_device
186
+ dtype = dtype or self.text_encoder.dtype
187
+
188
+ prompt = [prompt] if isinstance(prompt, str) else prompt
189
+
190
+ text_inputs = self.tokenizer(
191
+ prompt,
192
+ padding="longest", # not use max length
193
+ max_length=max_sequence_length,
194
+ truncation=True,
195
+ add_special_tokens=True,
196
+ return_tensors="pt",
197
+ )
198
+ text_input_ids = text_inputs.input_ids
199
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
200
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
201
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
202
+ logger.warning(
203
+ "The following part of your input was truncated because `max_sequence_length` is set to "
204
+ f" {max_sequence_length} tokens: {removed_text}"
205
+ )
206
+ current_length = text_input_ids.shape[1]
207
+ pad_length = (16 - (current_length % 16)) % 16
208
+ if pad_length > 0:
209
+ pad_ids = torch.full(
210
+ (text_input_ids.shape[0], pad_length),
211
+ fill_value=self.tokenizer.pad_token_id,
212
+ dtype=text_input_ids.dtype,
213
+ device=text_input_ids.device,
214
+ )
215
+ text_input_ids = torch.cat([pad_ids, text_input_ids], dim=1)
216
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=True).hidden_states[-2]
217
+
218
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
219
+ return prompt_embeds
220
+
221
+ def encode_prompt(
222
+ self,
223
+ prompt: Union[str, List[str]],
224
+ negative_prompt: Optional[Union[str, List[str]]] = None,
225
+ do_classifier_free_guidance: bool = True,
226
+ num_images_per_prompt: int = 1,
227
+ prompt_embeds: Optional[torch.Tensor] = None,
228
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
229
+ device: Optional[torch.device] = None,
230
+ dtype: Optional[torch.dtype] = None,
231
+ max_sequence_length: int = 1024,
232
+ ):
233
+ r"""
234
+ Encodes the prompt into text encoder hidden states.
235
+
236
+ Args:
237
+ prompt (`str` or `List[str]`, *optional*):
238
+ prompt to be encoded
239
+ negative_prompt (`str` or `List[str]`, *optional*):
240
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
241
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
242
+ less than `1`).
243
+ do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
244
+ Whether to use classifier free guidance or not.
245
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
246
+ Number of images that should be generated per prompt. torch device to place the resulting embeddings on
247
+ prompt_embeds (`torch.Tensor`, *optional*):
248
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
249
+ provided, text embeddings will be generated from `prompt` input argument.
250
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
251
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
252
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
253
+ argument.
254
+ device: (`torch.device`, *optional*):
255
+ torch device
256
+ dtype: (`torch.dtype`, *optional*):
257
+ torch dtype
258
+ max_sequence_length (`int`, defaults to `1024`):
259
+ Maximum sequence length in encoded prompt. Can be set to other values but may lead to poorer results.
260
+ """
261
+ device = device or self._execution_device
262
+
263
+ prompt = [prompt] if isinstance(prompt, str) else prompt
264
+ if prompt is not None:
265
+ batch_size = len(prompt)
266
+ else:
267
+ batch_size = prompt_embeds.shape[0]
268
+
269
+ if prompt_embeds is None:
270
+ prompt_embeds = self._get_glm_embeds(prompt, max_sequence_length, device, dtype)
271
+
272
+ seq_len = prompt_embeds.size(1)
273
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
274
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
275
+
276
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
277
+ negative_prompt = negative_prompt or ""
278
+ negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
279
+
280
+ if prompt is not None and type(prompt) is not type(negative_prompt):
281
+ raise TypeError(
282
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
283
+ f" {type(prompt)}."
284
+ )
285
+ elif batch_size != len(negative_prompt):
286
+ raise ValueError(
287
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
288
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
289
+ " the batch size of `prompt`."
290
+ )
291
+
292
+ negative_prompt_embeds = self._get_glm_embeds(negative_prompt, max_sequence_length, device, dtype)
293
+
294
+ seq_len = negative_prompt_embeds.size(1)
295
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
296
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
297
+
298
+ return prompt_embeds, negative_prompt_embeds
299
+
300
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
301
+ if latents is not None:
302
+ return latents.to(device)
303
+
304
+ shape = (
305
+ batch_size,
306
+ num_channels_latents,
307
+ int(height) // self.vae_scale_factor,
308
+ int(width) // self.vae_scale_factor,
309
+ )
310
+ if isinstance(generator, list) and len(generator) != batch_size:
311
+ raise ValueError(
312
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
313
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
314
+ )
315
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
316
+ return latents
317
+
318
+ def check_inputs(
319
+ self,
320
+ prompt,
321
+ height,
322
+ width,
323
+ negative_prompt,
324
+ callback_on_step_end_tensor_inputs,
325
+ prompt_embeds=None,
326
+ negative_prompt_embeds=None,
327
+ ):
328
+ if height % 16 != 0 or width % 16 != 0:
329
+ raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
330
+
331
+ if callback_on_step_end_tensor_inputs is not None and not all(
332
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
333
+ ):
334
+ raise ValueError(
335
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
336
+ )
337
+ if prompt is not None and prompt_embeds is not None:
338
+ raise ValueError(
339
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
340
+ " only forward one of the two."
341
+ )
342
+ elif prompt is None and prompt_embeds is None:
343
+ raise ValueError(
344
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
345
+ )
346
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
347
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
348
+
349
+ if prompt is not None and negative_prompt_embeds is not None:
350
+ raise ValueError(
351
+ f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
352
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
353
+ )
354
+
355
+ if negative_prompt is not None and negative_prompt_embeds is not None:
356
+ raise ValueError(
357
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
358
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
359
+ )
360
+
361
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
362
+ if prompt_embeds.shape[0] != negative_prompt_embeds.shape[0]:
363
+ raise ValueError(
364
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same batch size when passed directly, but"
365
+ f" got: `prompt_embeds` {prompt_embeds.shape} and `negative_prompt_embeds`"
366
+ f" {negative_prompt_embeds.shape}."
367
+ )
368
+ if prompt_embeds.shape[-1] != negative_prompt_embeds.shape[-1]:
369
+ raise ValueError(
370
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same dimension when passed directly, but"
371
+ f" got: `prompt_embeds` {prompt_embeds.shape} and `negative_prompt_embeds`"
372
+ f" {negative_prompt_embeds.shape}."
373
+ )
374
+
375
+ @property
376
+ def guidance_scale(self):
377
+ return self._guidance_scale
378
+
379
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
380
+ # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
381
+ # corresponds to doing no classifier free guidance.
382
+ @property
383
+ def do_classifier_free_guidance(self):
384
+ return self._guidance_scale > 1
385
+
386
+ @property
387
+ def num_timesteps(self):
388
+ return self._num_timesteps
389
+
390
+ @property
391
+ def attention_kwargs(self):
392
+ return self._attention_kwargs
393
+
394
+ @property
395
+ def current_timestep(self):
396
+ return self._current_timestep
397
+
398
+ @property
399
+ def interrupt(self):
400
+ return self._interrupt
401
+
402
+ @torch.no_grad()
403
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
404
+ def __call__(
405
+ self,
406
+ prompt: Optional[Union[str, List[str]]] = None,
407
+ negative_prompt: Optional[Union[str, List[str]]] = None,
408
+ height: Optional[int] = None,
409
+ width: Optional[int] = None,
410
+ num_inference_steps: int = 50,
411
+ timesteps: Optional[List[int]] = None,
412
+ sigmas: Optional[List[float]] = None,
413
+ guidance_scale: float = 5.0,
414
+ num_images_per_prompt: int = 1,
415
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
416
+ latents: Optional[torch.FloatTensor] = None,
417
+ prompt_embeds: Optional[torch.FloatTensor] = None,
418
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
419
+ original_size: Optional[Tuple[int, int]] = None,
420
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
421
+ output_type: str = "pil",
422
+ return_dict: bool = True,
423
+ attention_kwargs: Optional[Dict[str, Any]] = None,
424
+ callback_on_step_end: Optional[
425
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
426
+ ] = None,
427
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
428
+ max_sequence_length: int = 1024,
429
+ ) -> Union[CogView4PipelineOutput, Tuple]:
430
+ """
431
+ Function invoked when calling the pipeline for generation.
432
+
433
+ Args:
434
+ prompt (`str` or `List[str]`, *optional*):
435
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
436
+ negative_prompt (`str` or `List[str]`, *optional*):
437
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
438
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
439
+ less than `1`).
440
+ height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor):
441
+ The height in pixels of the generated image. If not provided, it is set to 1024.
442
+ width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor):
443
+ The width in pixels of the generated image. If not provided it is set to 1024.
444
+ num_inference_steps (`int`, *optional*, defaults to `50`):
445
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
446
+ expense of slower inference.
447
+ timesteps (`List[int]`, *optional*):
448
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
449
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
450
+ passed will be used. Must be in descending order.
451
+ sigmas (`List[float]`, *optional*):
452
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
453
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
454
+ will be used.
455
+ guidance_scale (`float`, *optional*, defaults to `5.0`):
456
+ Guidance scale as defined in [Classifier-Free Diffusion
457
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
458
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
459
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
460
+ the text `prompt`, usually at the expense of lower image quality.
461
+ num_images_per_prompt (`int`, *optional*, defaults to `1`):
462
+ The number of images to generate per prompt.
463
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
464
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
465
+ to make generation deterministic.
466
+ latents (`torch.FloatTensor`, *optional*):
467
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
468
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
469
+ tensor will be generated by sampling using the supplied random `generator`.
470
+ prompt_embeds (`torch.FloatTensor`, *optional*):
471
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
472
+ provided, text embeddings will be generated from `prompt` input argument.
473
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
474
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
475
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
476
+ argument.
477
+ original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
478
+ If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
479
+ `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
480
+ explained in section 2.2 of
481
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
482
+ crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
483
+ `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
484
+ `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
485
+ `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
486
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
487
+ output_type (`str`, *optional*, defaults to `"pil"`):
488
+ The output format of the generate image. Choose between
489
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
490
+ return_dict (`bool`, *optional*, defaults to `True`):
491
+ Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
492
+ of a plain tuple.
493
+ attention_kwargs (`dict`, *optional*):
494
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
495
+ `self.processor` in
496
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
497
+ callback_on_step_end (`Callable`, *optional*):
498
+ A function that calls at the end of each denoising steps during the inference. The function is called
499
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
500
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
501
+ `callback_on_step_end_tensor_inputs`.
502
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
503
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
504
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
505
+ `._callback_tensor_inputs` attribute of your pipeline class.
506
+ max_sequence_length (`int`, defaults to `224`):
507
+ Maximum sequence length in encoded prompt. Can be set to other values but may lead to poorer results.
508
+
509
+ Examples:
510
+
511
+ Returns:
512
+ [`~pipelines.cogview4.pipeline_CogView4.CogView4PipelineOutput`] or `tuple`:
513
+ [`~pipelines.cogview4.pipeline_CogView4.CogView4PipelineOutput`] if `return_dict` is True, otherwise a
514
+ `tuple`. When returning a tuple, the first element is a list with the generated images.
515
+ """
516
+
517
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
518
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
519
+
520
+ height = height or self.transformer.config.sample_size * self.vae_scale_factor
521
+ width = width or self.transformer.config.sample_size * self.vae_scale_factor
522
+
523
+ original_size = original_size or (height, width)
524
+ target_size = (height, width)
525
+
526
+ # Check inputs. Raise error if not correct
527
+ self.check_inputs(
528
+ prompt,
529
+ height,
530
+ width,
531
+ negative_prompt,
532
+ callback_on_step_end_tensor_inputs,
533
+ prompt_embeds,
534
+ negative_prompt_embeds,
535
+ )
536
+ self._guidance_scale = guidance_scale
537
+ self._attention_kwargs = attention_kwargs
538
+ self._current_timestep = None
539
+ self._interrupt = False
540
+
541
+ # Default call parameters
542
+ if prompt is not None and isinstance(prompt, str):
543
+ batch_size = 1
544
+ elif prompt is not None and isinstance(prompt, list):
545
+ batch_size = len(prompt)
546
+ else:
547
+ batch_size = prompt_embeds.shape[0]
548
+
549
+ device = self._execution_device
550
+
551
+ # Encode input prompt
552
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
553
+ prompt,
554
+ negative_prompt,
555
+ self.do_classifier_free_guidance,
556
+ num_images_per_prompt=num_images_per_prompt,
557
+ prompt_embeds=prompt_embeds,
558
+ negative_prompt_embeds=negative_prompt_embeds,
559
+ max_sequence_length=max_sequence_length,
560
+ device=device,
561
+ )
562
+
563
+ # Prepare latents
564
+ latent_channels = self.transformer.config.in_channels
565
+ latents = self.prepare_latents(
566
+ batch_size * num_images_per_prompt,
567
+ latent_channels,
568
+ height,
569
+ width,
570
+ torch.float32,
571
+ device,
572
+ generator,
573
+ latents,
574
+ )
575
+
576
+ # Prepare additional timestep conditions
577
+ original_size = torch.tensor([original_size], dtype=prompt_embeds.dtype, device=device)
578
+ target_size = torch.tensor([target_size], dtype=prompt_embeds.dtype, device=device)
579
+ crops_coords_top_left = torch.tensor([crops_coords_top_left], dtype=prompt_embeds.dtype, device=device)
580
+
581
+ original_size = original_size.repeat(batch_size * num_images_per_prompt, 1)
582
+ target_size = target_size.repeat(batch_size * num_images_per_prompt, 1)
583
+ crops_coords_top_left = crops_coords_top_left.repeat(batch_size * num_images_per_prompt, 1)
584
+
585
+ # Prepare timesteps
586
+ image_seq_len = ((height // self.vae_scale_factor) * (width // self.vae_scale_factor)) // (
587
+ self.transformer.config.patch_size**2
588
+ )
589
+ timesteps = (
590
+ np.linspace(self.scheduler.config.num_train_timesteps, 1.0, num_inference_steps)
591
+ if timesteps is None
592
+ else np.array(timesteps)
593
+ )
594
+ timesteps = timesteps.astype(np.int64).astype(np.float32)
595
+ sigmas = timesteps / self.scheduler.config.num_train_timesteps if sigmas is None else sigmas
596
+ mu = calculate_shift(
597
+ image_seq_len,
598
+ self.scheduler.config.get("base_image_seq_len", 256),
599
+ self.scheduler.config.get("base_shift", 0.25),
600
+ self.scheduler.config.get("max_shift", 0.75),
601
+ )
602
+ timesteps, num_inference_steps = retrieve_timesteps(
603
+ self.scheduler, num_inference_steps, device, timesteps, sigmas, mu=mu
604
+ )
605
+ self._num_timesteps = len(timesteps)
606
+
607
+ # Denoising loop
608
+ transformer_dtype = self.transformer.dtype
609
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
610
+
611
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
612
+ for i, t in enumerate(timesteps):
613
+ if self.interrupt:
614
+ continue
615
+
616
+ self._current_timestep = t
617
+ latent_model_input = latents.to(transformer_dtype)
618
+
619
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
620
+ timestep = t.expand(latents.shape[0])
621
+
622
+ with self.transformer.cache_context("cond"):
623
+ noise_pred_cond = self.transformer(
624
+ hidden_states=latent_model_input,
625
+ encoder_hidden_states=prompt_embeds,
626
+ timestep=timestep,
627
+ original_size=original_size,
628
+ target_size=target_size,
629
+ crop_coords=crops_coords_top_left,
630
+ attention_kwargs=attention_kwargs,
631
+ return_dict=False,
632
+ )[0]
633
+
634
+ # perform guidance
635
+ if self.do_classifier_free_guidance:
636
+ with self.transformer.cache_context("uncond"):
637
+ noise_pred_uncond = self.transformer(
638
+ hidden_states=latent_model_input,
639
+ encoder_hidden_states=negative_prompt_embeds,
640
+ timestep=timestep,
641
+ original_size=original_size,
642
+ target_size=target_size,
643
+ crop_coords=crops_coords_top_left,
644
+ attention_kwargs=attention_kwargs,
645
+ return_dict=False,
646
+ )[0]
647
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
648
+ else:
649
+ noise_pred = noise_pred_cond
650
+
651
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
652
+
653
+ # call the callback, if provided
654
+ if callback_on_step_end is not None:
655
+ callback_kwargs = {}
656
+ for k in callback_on_step_end_tensor_inputs:
657
+ callback_kwargs[k] = locals()[k]
658
+ callback_outputs = callback_on_step_end(self, i, self.scheduler.sigmas[i], callback_kwargs)
659
+ latents = callback_outputs.pop("latents", latents)
660
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
661
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
662
+
663
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
664
+ progress_bar.update()
665
+
666
+ if XLA_AVAILABLE:
667
+ xm.mark_step()
668
+
669
+ self._current_timestep = None
670
+
671
+ if not output_type == "latent":
672
+ latents = latents.to(self.vae.dtype) / self.vae.config.scaling_factor
673
+ image = self.vae.decode(latents, return_dict=False, generator=generator)[0]
674
+ else:
675
+ image = latents
676
+
677
+ image = self.image_processor.postprocess(image, output_type=output_type)
678
+
679
+ # Offload all models
680
+ self.maybe_free_model_hooks()
681
+
682
+ if not return_dict:
683
+ return (image,)
684
+
685
+ return CogView4PipelineOutput(images=image)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview4/pipeline_cogview4_control.py ADDED
@@ -0,0 +1,732 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import inspect
17
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
18
+
19
+ import numpy as np
20
+ import torch
21
+ from transformers import AutoTokenizer, GlmModel
22
+
23
+ from ...callbacks import MultiPipelineCallbacks, PipelineCallback
24
+ from ...image_processor import PipelineImageInput, VaeImageProcessor
25
+ from ...models import AutoencoderKL, CogView4Transformer2DModel
26
+ from ...pipelines.pipeline_utils import DiffusionPipeline
27
+ from ...schedulers import FlowMatchEulerDiscreteScheduler
28
+ from ...utils import is_torch_xla_available, logging, replace_example_docstring
29
+ from ...utils.torch_utils import randn_tensor
30
+ from .pipeline_output import CogView4PipelineOutput
31
+
32
+
33
# Detect torch-XLA (e.g. TPU) support once at import time; the flag is used to
# guard `xm.mark_step()` calls during denoising.
if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
41
+
42
+ EXAMPLE_DOC_STRING = """
43
+ Examples:
44
+ ```python
45
+ >>> import torch
46
+ >>> from diffusers import CogView4ControlPipeline
47
+
48
+ >>> pipe = CogView4ControlPipeline.from_pretrained("THUDM/CogView4-6B-Control", torch_dtype=torch.bfloat16)
49
+ >>> control_image = load_image(
50
+ ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
51
+ ... )
52
+ >>> prompt = "A bird in space"
53
+ >>> image = pipe(prompt, control_image=control_image, height=1024, width=1024, guidance_scale=3.5).images[0]
54
+ >>> image.save("cogview4-control.png")
55
+ ```
56
+ """
57
+
58
+
59
+ # Copied from diffusers.pipelines.cogview4.pipeline_cogview4.calculate_shift
60
def calculate_shift(
    image_seq_len,
    base_seq_len: int = 256,
    base_shift: float = 0.25,
    max_shift: float = 0.75,
) -> float:
    """Compute the timestep-shift parameter ``mu`` for the flow-matching scheduler.

    The shift grows with the square root of the ratio between the actual image
    sequence length and the base sequence length the scheduler was tuned for:
    ``mu = sqrt(image_seq_len / base_seq_len) * max_shift + base_shift``.
    """
    scale = (image_seq_len / base_seq_len) ** 0.5
    return scale * max_shift + base_shift
69
+
70
+
71
+ # Copied from diffusers.pipelines.cogview4.pipeline_cogview4.retrieve_timesteps
72
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    r"""
    Configure `scheduler` and return the resulting timestep schedule.

    Calls `scheduler.set_timesteps` with whichever of `num_inference_steps`, `timesteps` or `sigmas` was
    provided (custom `timesteps`/`sigmas` take precedence over `num_inference_steps`) and forwards any extra
    keyword arguments. Raises a `ValueError` when the scheduler's `set_timesteps` signature does not accept
    the custom schedule that was requested.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    """
    setter_params = set(inspect.signature(scheduler.set_timesteps).parameters.keys())
    accepts_timesteps = "timesteps" in setter_params
    accepts_sigmas = "sigmas" in setter_params

    if timesteps is None and sigmas is None:
        # Default path: let the scheduler build its own schedule from the step count.
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        return scheduler.timesteps, num_inference_steps

    if timesteps is not None and sigmas is not None:
        if not accepts_timesteps and not accepts_sigmas:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep or sigma schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, sigmas=sigmas, device=device, **kwargs)
    elif timesteps is not None:
        if not accepts_timesteps:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
    else:
        if not accepts_sigmas:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)

    # For every custom-schedule path the effective step count is whatever the scheduler produced.
    timesteps = scheduler.timesteps
    return timesteps, len(timesteps)
137
+
138
+
139
class CogView4ControlPipeline(DiffusionPipeline):
    r"""
    Pipeline for text-to-image generation using CogView4.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`GLMModel`]):
            Frozen text-encoder. CogView4 uses [glm-4-9b-hf](https://huggingface.co/THUDM/glm-4-9b-hf).
        tokenizer (`PreTrainedTokenizer`):
            Tokenizer of class
            [PreTrainedTokenizer](https://huggingface.co/docs/transformers/main/en/main_classes/tokenizer#transformers.PreTrainedTokenizer).
        transformer ([`CogView4Transformer2DModel`]):
            A text conditioned `CogView4Transformer2DModel` to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
    """

    # Every registered component is required for this pipeline.
    _optional_components = []
    # Order in which sub-models are moved between CPU and GPU during model CPU offloading.
    model_cpu_offload_seq = "text_encoder->transformer->vae"
    # Tensor names that `callback_on_step_end` callbacks are allowed to read and override.
    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
163
+
164
+ def __init__(
165
+ self,
166
+ tokenizer: AutoTokenizer,
167
+ text_encoder: GlmModel,
168
+ vae: AutoencoderKL,
169
+ transformer: CogView4Transformer2DModel,
170
+ scheduler: FlowMatchEulerDiscreteScheduler,
171
+ ):
172
+ super().__init__()
173
+
174
+ self.register_modules(
175
+ tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
176
+ )
177
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
178
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
179
+
180
+ # Copied from diffusers.pipelines.cogview4.pipeline_cogview4.CogView4Pipeline._get_glm_embeds
181
    def _get_glm_embeds(
        self,
        prompt: Union[str, List[str]] = None,
        max_sequence_length: int = 1024,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ):
        """Tokenize `prompt` and return GLM hidden states to use as text embeddings.

        Returns the second-to-last hidden state of the text encoder, with input ids
        left-padded so the sequence length is a multiple of 16, cast to `dtype` on
        `device`.
        """
        device = device or self._execution_device
        dtype = dtype or self.text_encoder.dtype

        prompt = [prompt] if isinstance(prompt, str) else prompt

        text_inputs = self.tokenizer(
            prompt,
            padding="longest",  # not use max length
            max_length=max_sequence_length,
            truncation=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        text_input_ids = text_inputs.input_ids
        # Re-tokenize without a length cap to detect (and warn about) text lost to truncation.
        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
            logger.warning(
                "The following part of your input was truncated because `max_sequence_length` is set to "
                f" {max_sequence_length} tokens: {removed_text}"
            )
        # Left-pad with the pad token so the sequence length is a multiple of 16
        # (presumably required by the transformer's text branch — confirm against model code).
        current_length = text_input_ids.shape[1]
        pad_length = (16 - (current_length % 16)) % 16
        if pad_length > 0:
            pad_ids = torch.full(
                (text_input_ids.shape[0], pad_length),
                fill_value=self.tokenizer.pad_token_id,
                dtype=text_input_ids.dtype,
                device=text_input_ids.device,
            )
            text_input_ids = torch.cat([pad_ids, text_input_ids], dim=1)
        # Take the second-to-last hidden state as the prompt embedding.
        prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=True).hidden_states[-2]

        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
        return prompt_embeds
223
+
224
+ # Copied from diffusers.pipelines.cogview4.pipeline_cogview4.CogView4Pipeline.encode_prompt
225
    def encode_prompt(
        self,
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
        do_classifier_free_guidance: bool = True,
        num_images_per_prompt: int = 1,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
        max_sequence_length: int = 1024,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
                Whether to use classifier free guidance or not.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                Number of images that should be generated per prompt. torch device to place the resulting embeddings on
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            device: (`torch.device`, *optional*):
                torch device
            dtype: (`torch.dtype`, *optional*):
                torch dtype
            max_sequence_length (`int`, defaults to `1024`):
                Maximum sequence length in encoded prompt. Can be set to other values but may lead to poorer results.

        Returns:
            A `(prompt_embeds, negative_prompt_embeds)` tuple; `negative_prompt_embeds` is `None` unless
            classifier-free guidance is enabled or embeddings were passed in.
        """
        device = device or self._execution_device

        prompt = [prompt] if isinstance(prompt, str) else prompt
        if prompt is not None:
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            prompt_embeds = self._get_glm_embeds(prompt, max_sequence_length, device, dtype)

        # Duplicate the embeddings once per requested image so the batch dimension
        # becomes `batch_size * num_images_per_prompt`.
        seq_len = prompt_embeds.size(1)
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

        if do_classifier_free_guidance and negative_prompt_embeds is None:
            # An absent negative prompt means unconditional generation with an empty string.
            negative_prompt = negative_prompt or ""
            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt

            if prompt is not None and type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )

            negative_prompt_embeds = self._get_glm_embeds(negative_prompt, max_sequence_length, device, dtype)

            # Same per-image duplication as for the positive embeddings.
            seq_len = negative_prompt_embeds.size(1)
            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

        return prompt_embeds, negative_prompt_embeds
303
+
304
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
305
+ if latents is not None:
306
+ return latents.to(device)
307
+
308
+ shape = (
309
+ batch_size,
310
+ num_channels_latents,
311
+ int(height) // self.vae_scale_factor,
312
+ int(width) // self.vae_scale_factor,
313
+ )
314
+ if isinstance(generator, list) and len(generator) != batch_size:
315
+ raise ValueError(
316
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
317
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
318
+ )
319
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
320
+ return latents
321
+
322
+ def prepare_image(
323
+ self,
324
+ image,
325
+ width,
326
+ height,
327
+ batch_size,
328
+ num_images_per_prompt,
329
+ device,
330
+ dtype,
331
+ do_classifier_free_guidance=False,
332
+ guess_mode=False,
333
+ ):
334
+ if isinstance(image, torch.Tensor):
335
+ pass
336
+ else:
337
+ image = self.image_processor.preprocess(image, height=height, width=width)
338
+
339
+ image_batch_size = image.shape[0]
340
+
341
+ if image_batch_size == 1:
342
+ repeat_by = batch_size
343
+ else:
344
+ # image batch size is the same as prompt batch size
345
+ repeat_by = num_images_per_prompt
346
+
347
+ image = image.repeat_interleave(repeat_by, dim=0, output_size=image.shape[0] * repeat_by)
348
+
349
+ image = image.to(device=device, dtype=dtype)
350
+
351
+ if do_classifier_free_guidance and not guess_mode:
352
+ image = torch.cat([image] * 2)
353
+
354
+ return image
355
+
356
+ def check_inputs(
357
+ self,
358
+ prompt,
359
+ height,
360
+ width,
361
+ negative_prompt,
362
+ callback_on_step_end_tensor_inputs,
363
+ prompt_embeds=None,
364
+ negative_prompt_embeds=None,
365
+ ):
366
+ if height % 16 != 0 or width % 16 != 0:
367
+ raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
368
+
369
+ if callback_on_step_end_tensor_inputs is not None and not all(
370
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
371
+ ):
372
+ raise ValueError(
373
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
374
+ )
375
+ if prompt is not None and prompt_embeds is not None:
376
+ raise ValueError(
377
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
378
+ " only forward one of the two."
379
+ )
380
+ elif prompt is None and prompt_embeds is None:
381
+ raise ValueError(
382
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
383
+ )
384
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
385
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
386
+
387
+ if prompt is not None and negative_prompt_embeds is not None:
388
+ raise ValueError(
389
+ f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
390
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
391
+ )
392
+
393
+ if negative_prompt is not None and negative_prompt_embeds is not None:
394
+ raise ValueError(
395
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
396
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
397
+ )
398
+
399
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
400
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
401
+ raise ValueError(
402
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
403
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
404
+ f" {negative_prompt_embeds.shape}."
405
+ )
406
+
407
    @property
    def guidance_scale(self):
        # Classifier-free guidance weight stored by `__call__`.
        return self._guidance_scale

    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    @property
    def do_classifier_free_guidance(self):
        return self._guidance_scale > 1

    @property
    def num_timesteps(self):
        # Number of denoising steps scheduled for the current call.
        return self._num_timesteps

    @property
    def attention_kwargs(self):
        # Extra kwargs forwarded to the attention processors.
        return self._attention_kwargs

    @property
    def current_timestep(self):
        # Timestep currently being denoised (None outside the denoising loop).
        return self._current_timestep

    @property
    def interrupt(self):
        # When True, the denoising loop skips its remaining steps.
        return self._interrupt
433
+
434
    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        prompt: Optional[Union[str, List[str]]] = None,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        control_image: PipelineImageInput = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        timesteps: Optional[List[int]] = None,
        sigmas: Optional[List[float]] = None,
        guidance_scale: float = 5.0,
        num_images_per_prompt: int = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        original_size: Optional[Tuple[int, int]] = None,
        crops_coords_top_left: Tuple[int, int] = (0, 0),
        output_type: str = "pil",
        return_dict: bool = True,
        attention_kwargs: Optional[Dict[str, Any]] = None,
        callback_on_step_end: Optional[
            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        max_sequence_length: int = 1024,
    ) -> Union[CogView4PipelineOutput, Tuple]:
        """
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor):
                The height in pixels of the generated image. If not provided, it is set to 1024.
            width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor):
                The width in pixels of the generated image. If not provided it is set to 1024.
            num_inference_steps (`int`, *optional*, defaults to `50`):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            timesteps (`List[int]`, *optional*):
                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                passed will be used. Must be in descending order.
            sigmas (`List[float]`, *optional*):
                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                will be used.
            guidance_scale (`float`, *optional*, defaults to `5.0`):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                the text `prompt`, usually at the expense of lower image quality.
            num_images_per_prompt (`int`, *optional*, defaults to `1`):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
                explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.pipeline_CogView4.CogView4PipelineOutput`] instead of a plain
                tuple.
            attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            max_sequence_length (`int`, defaults to `1024`):
                Maximum sequence length in encoded prompt. Can be set to other values but may lead to poorer results.
        Examples:

        Returns:
            [`~pipelines.cogview4.pipeline_CogView4.CogView4PipelineOutput`] or `tuple`:
                [`~pipelines.cogview4.pipeline_CogView4.CogView4PipelineOutput`] if `return_dict` is True, otherwise a
                `tuple`. When returning a tuple, the first element is a list with the generated images.
        """

        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

        height = height or self.transformer.config.sample_size * self.vae_scale_factor
        width = width or self.transformer.config.sample_size * self.vae_scale_factor

        original_size = original_size or (height, width)
        target_size = (height, width)

        # Check inputs. Raise error if not correct
        self.check_inputs(
            prompt,
            height,
            width,
            negative_prompt,
            callback_on_step_end_tensor_inputs,
            prompt_embeds,
            negative_prompt_embeds,
        )
        self._guidance_scale = guidance_scale
        self._attention_kwargs = attention_kwargs
        self._current_timestep = None
        self._interrupt = False

        # Default call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device

        # Encode input prompt
        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
            prompt,
            negative_prompt,
            self.do_classifier_free_guidance,
            num_images_per_prompt=num_images_per_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            max_sequence_length=max_sequence_length,
            device=device,
        )

        # Prepare latents
        # Half of the transformer's input channels carry the noisy latents; the other
        # half carry the VAE-encoded control image (channel-wise concat below).
        latent_channels = self.transformer.config.in_channels // 2

        control_image = self.prepare_image(
            image=control_image,
            width=width,
            height=height,
            batch_size=batch_size * num_images_per_prompt,
            num_images_per_prompt=num_images_per_prompt,
            device=device,
            dtype=self.vae.dtype,
        )
        height, width = control_image.shape[-2:]

        # NOTE(review): shift factor hard-coded to 0 — presumably this VAE has no
        # latent shift; confirm against `self.vae.config.shift_factor`.
        vae_shift_factor = 0

        control_image = self.vae.encode(control_image).latent_dist.sample()
        control_image = (control_image - vae_shift_factor) * self.vae.config.scaling_factor

        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            latent_channels,
            height,
            width,
            torch.float32,
            device,
            generator,
            latents,
        )

        # Prepare additional timestep conditions
        original_size = torch.tensor([original_size], dtype=prompt_embeds.dtype, device=device)
        target_size = torch.tensor([target_size], dtype=prompt_embeds.dtype, device=device)
        crops_coords_top_left = torch.tensor([crops_coords_top_left], dtype=prompt_embeds.dtype, device=device)

        original_size = original_size.repeat(batch_size * num_images_per_prompt, 1)
        target_size = target_size.repeat(batch_size * num_images_per_prompt, 1)
        crops_coords_top_left = crops_coords_top_left.repeat(batch_size * num_images_per_prompt, 1)

        # Prepare timesteps
        image_seq_len = ((height // self.vae_scale_factor) * (width // self.vae_scale_factor)) // (
            self.transformer.config.patch_size**2
        )

        # Default schedule spans the full training range; the int64 round-trip
        # quantizes the linspace values to integer timesteps.
        timesteps = (
            np.linspace(self.scheduler.config.num_train_timesteps, 1.0, num_inference_steps)
            if timesteps is None
            else np.array(timesteps)
        )
        timesteps = timesteps.astype(np.int64).astype(np.float32)
        sigmas = timesteps / self.scheduler.config.num_train_timesteps if sigmas is None else sigmas
        # Resolution-dependent timestep shift (larger images get a different schedule).
        mu = calculate_shift(
            image_seq_len,
            self.scheduler.config.get("base_image_seq_len", 256),
            self.scheduler.config.get("base_shift", 0.25),
            self.scheduler.config.get("max_shift", 0.75),
        )
        timesteps, num_inference_steps = retrieve_timesteps(
            self.scheduler, num_inference_steps, device, timesteps, sigmas, mu=mu
        )
        self._num_timesteps = len(timesteps)
        # Denoising loop
        transformer_dtype = self.transformer.dtype
        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                if self.interrupt:
                    continue

                self._current_timestep = t
                latent_model_input = torch.cat([latents, control_image], dim=1).to(transformer_dtype)

                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                timestep = t.expand(latents.shape[0])

                noise_pred_cond = self.transformer(
                    hidden_states=latent_model_input,
                    encoder_hidden_states=prompt_embeds,
                    timestep=timestep,
                    original_size=original_size,
                    target_size=target_size,
                    crop_coords=crops_coords_top_left,
                    attention_kwargs=attention_kwargs,
                    return_dict=False,
                )[0]

                # perform guidance
                if self.do_classifier_free_guidance:
                    # Second forward pass with the negative embeddings (sequential CFG,
                    # not a batched cond/uncond pass).
                    noise_pred_uncond = self.transformer(
                        hidden_states=latent_model_input,
                        encoder_hidden_states=negative_prompt_embeds,
                        timestep=timestep,
                        original_size=original_size,
                        target_size=target_size,
                        crop_coords=crops_coords_top_left,
                        attention_kwargs=attention_kwargs,
                        return_dict=False,
                    )[0]

                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
                else:
                    noise_pred = noise_pred_cond
                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]

                # call the callback, if provided
                # NOTE(review): the second positional argument passed to the callback is
                # the current sigma, not the timestep `t`.
                if callback_on_step_end is not None:
                    callback_kwargs = {}
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, self.scheduler.sigmas[i], callback_kwargs)
                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)

                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()

                if XLA_AVAILABLE:
                    xm.mark_step()

        self._current_timestep = None

        if not output_type == "latent":
            latents = latents.to(self.vae.dtype) / self.vae.config.scaling_factor
            image = self.vae.decode(latents, return_dict=False, generator=generator)[0]
        else:
            image = latents

        image = self.image_processor.postprocess(image, output_type=output_type)

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return (image,)

        return CogView4PipelineOutput(images=image)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/cogview4/pipeline_output.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import List, Union
3
+
4
+ import numpy as np
5
+ import PIL.Image
6
+
7
+ from ...utils import BaseOutput
8
+
9
+
10
@dataclass
class CogView4PipelineOutput(BaseOutput):
    """
    Output class for CogView4 pipelines.

    Args:
        images (`List[PIL.Image.Image]` or `np.ndarray`)
            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
    """

    images: Union[List[PIL.Image.Image], np.ndarray]
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/consisid/consisid_utils.py ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib.util
2
+ import os
3
+
4
+ import cv2
5
+ import numpy as np
6
+ import torch
7
+ from PIL import Image, ImageOps
8
+ from torchvision.transforms import InterpolationMode
9
+ from torchvision.transforms.functional import normalize, resize
10
+
11
+ from ...utils import get_logger, load_image
12
+
13
+
14
+ logger = get_logger(__name__)
15
+
16
# Soft-detect the optional third-party dependencies without importing them yet.
_insightface_available = importlib.util.find_spec("insightface") is not None
_consisid_eva_clip_available = importlib.util.find_spec("consisid_eva_clip") is not None
_facexlib_available = importlib.util.find_spec("facexlib") is not None

# NOTE(review): each guard raises at *import* time, so this module is unusable
# unless all three packages are installed — the availability flags only shape
# the error message, they do not enable a degraded mode.
if _insightface_available:
    import insightface
    from insightface.app import FaceAnalysis
else:
    raise ImportError("insightface is not available. Please install it using 'pip install insightface'.")

if _consisid_eva_clip_available:
    from consisid_eva_clip import create_model_and_transforms
    from consisid_eva_clip.constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
else:
    raise ImportError("consisid_eva_clip is not available. Please install it using 'pip install consisid_eva_clip'.")

if _facexlib_available:
    from facexlib.parsing import init_parsing_model
    from facexlib.utils.face_restoration_helper import FaceRestoreHelper
else:
    raise ImportError("facexlib is not available. Please install it using 'pip install facexlib'.")
37
+
38
+
39
def resize_numpy_image_long(image, resize_long_edge=768):
    """
    Downscale an image so that its longer edge equals `resize_long_edge`.

    Images whose longer edge is already within the limit are returned unchanged
    (the same array object, no copy). Aspect ratio is always preserved.

    Args:
        image (numpy.ndarray): Input image (H x W x C or H x W).
        resize_long_edge (int): Target size for the long edge. Default is 768.

    Returns:
        numpy.ndarray: The (possibly resized) image.
    """
    height, width = image.shape[:2]
    longest = max(height, width)
    if longest <= resize_long_edge:
        # Never upscale — small images pass through untouched.
        return image
    scale = resize_long_edge / longest
    new_width = int(width * scale)
    new_height = int(height * scale)
    # Lanczos gives the best quality for downscaling photographic content.
    return cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_LANCZOS4)
60
+
61
+
62
def img2tensor(imgs, bgr2rgb=True, float32=True):
    """Numpy array to tensor.

    Args:
        imgs (list[ndarray] | ndarray): Input images.
        bgr2rgb (bool): Whether to change bgr to rgb.
        float32 (bool): Whether to change to float32.

    Returns:
        list[tensor] | tensor: Tensor images. If returned results only have
        one element, just return tensor.
    """

    def _convert(img):
        # BGR -> RGB only applies to 3-channel images; cv2 requires float32 input.
        if bgr2rgb and img.shape[2] == 3:
            if img.dtype == "float64":
                img = img.astype("float32")
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        # HWC -> CHW layout expected by torch.
        tensor = torch.from_numpy(img.transpose(2, 0, 1))
        return tensor.float() if float32 else tensor

    if isinstance(imgs, list):
        return [_convert(img) for img in imgs]
    return _convert(imgs)
88
+
89
+
90
def to_gray(img):
    """
    Convert a batch of RGB images to 3-channel grayscale.

    Applies the standard ITU-R luminosity weights (0.299 R + 0.587 G + 0.114 B)
    and replicates the result across all three channels so the output keeps the
    input's shape.

    Args:
        img (torch.Tensor): RGB batch of shape (batch_size, 3, height, width).

    Returns:
        torch.Tensor: Grayscale batch of shape (batch_size, 3, height, width).
    """
    luma = img[:, 0:1] * 0.299 + img[:, 1:2] * 0.587 + img[:, 2:3] * 0.114
    return luma.repeat(1, 3, 1, 1)
105
+
106
+
107
def process_face_embeddings(
    face_helper_1,
    clip_vision_model,
    face_helper_2,
    eva_transform_mean,
    eva_transform_std,
    app,
    device,
    weight_dtype,
    image,
    original_id_image=None,
    is_align_face=True,
):
    """
    Process face embeddings from an image, extracting relevant features such as face embeddings, landmarks, and parsed
    face features using a series of face detection and alignment tools.

    Args:
        face_helper_1: Face helper object (first helper) for alignment and landmark detection.
        clip_vision_model: Pre-trained CLIP vision model used for feature extraction.
        face_helper_2: Face helper object (second helper) for embedding extraction.
        eva_transform_mean: Mean values for image normalization before passing to EVA model.
        eva_transform_std: Standard deviation values for image normalization before passing to EVA model.
        app: Application instance used for face detection.
        device: Device (CPU or GPU) where the computations will be performed.
        weight_dtype: Data type of the weights for precision (e.g., `torch.float32`).
        image: Input image in RGB format with pixel values in the range [0, 255].
        original_id_image: (Optional) Original image for feature extraction if `is_align_face` is False.
        is_align_face: Boolean flag indicating whether face alignment should be performed.

    Raises:
        RuntimeError: If facexlib fails to produce any aligned/cropped face.

    Returns:
        Tuple:
            - id_cond: Concatenated tensor of Ante face embedding and CLIP vision embedding
            - id_vit_hidden: Hidden state of the CLIP vision model, a list of tensors.
            - return_face_features_image_2: Processed face features image after normalization and parsing.
            - face_kps: Keypoints of the face detected in the image.
    """

    face_helper_1.clean_all()
    # insightface / cv2 expect BGR channel order.
    image_bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    # get antelopev2 embedding
    face_info = app.get(image_bgr)
    if len(face_info) > 0:
        face_info = sorted(face_info, key=lambda x: (x["bbox"][2] - x["bbox"][0]) * (x["bbox"][3] - x["bbox"][1]))[
            -1
        ]  # only use the maximum face
        id_ante_embedding = face_info["embedding"]  # (512,)
        face_kps = face_info["kps"]
    else:
        # insightface found nothing — fall back to facexlib below.
        id_ante_embedding = None
        face_kps = None

    # using facexlib to detect and align face
    face_helper_1.read_image(image_bgr)
    face_helper_1.get_face_landmarks_5(only_center_face=True)
    if face_kps is None:
        face_kps = face_helper_1.all_landmarks_5[0]
    face_helper_1.align_warp_face()
    if len(face_helper_1.cropped_faces) == 0:
        raise RuntimeError("facexlib align face fail")
    align_face = face_helper_1.cropped_faces[0]  # (512, 512, 3) # RGB

    # in case insightface didn't detect face
    if id_ante_embedding is None:
        logger.warning("Failed to detect face using insightface. Extracting embedding with align face")
        id_ante_embedding = face_helper_2.get_feat(align_face)

    id_ante_embedding = torch.from_numpy(id_ante_embedding).to(device, weight_dtype)  # torch.Size([512])
    if id_ante_embedding.ndim == 1:
        id_ante_embedding = id_ante_embedding.unsqueeze(0)  # torch.Size([1, 512])

    # parsing
    if is_align_face:
        input = img2tensor(align_face, bgr2rgb=True).unsqueeze(0) / 255.0  # torch.Size([1, 3, 512, 512])
        input = input.to(device)
        # Semantic segmentation of the face; ImageNet normalization stats.
        parsing_out = face_helper_1.face_parse(normalize(input, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]))[0]
        parsing_out = parsing_out.argmax(dim=1, keepdim=True)  # torch.Size([1, 1, 512, 512])
        # Labels treated as non-face (background, hair, clothing, ...) — presumably
        # BiSeNet's label set; verify against the parsing model used.
        bg_label = [0, 16, 18, 7, 8, 9, 14, 15]
        bg = sum(parsing_out == i for i in bg_label).bool()
        white_image = torch.ones_like(input)  # torch.Size([1, 3, 512, 512])
        # only keep the face features
        return_face_features_image = torch.where(bg, white_image, to_gray(input))  # torch.Size([1, 3, 512, 512])
        return_face_features_image_2 = torch.where(bg, white_image, input)  # torch.Size([1, 3, 512, 512])
    else:
        # No alignment: use the caller-supplied original image as-is.
        original_image_bgr = cv2.cvtColor(original_id_image, cv2.COLOR_RGB2BGR)
        input = img2tensor(original_image_bgr, bgr2rgb=True).unsqueeze(0) / 255.0  # torch.Size([1, 3, 512, 512])
        input = input.to(device)
        return_face_features_image = return_face_features_image_2 = input

    # transform img before sending to eva-clip-vit
    face_features_image = resize(
        return_face_features_image, clip_vision_model.image_size, InterpolationMode.BICUBIC
    )  # torch.Size([1, 3, 336, 336])
    face_features_image = normalize(face_features_image, eva_transform_mean, eva_transform_std)
    id_cond_vit, id_vit_hidden = clip_vision_model(
        face_features_image.to(weight_dtype), return_all_features=False, return_hidden=True, shuffle=False
    )  # torch.Size([1, 768]), list(torch.Size([1, 577, 1024]))
    # L2-normalize the CLIP embedding before concatenation.
    id_cond_vit_norm = torch.norm(id_cond_vit, 2, 1, True)
    id_cond_vit = torch.div(id_cond_vit, id_cond_vit_norm)

    id_cond = torch.cat(
        [id_ante_embedding, id_cond_vit], dim=-1
    )  # torch.Size([1, 512]), torch.Size([1, 768]) -> torch.Size([1, 1280])

    return (
        id_cond,
        id_vit_hidden,
        return_face_features_image_2,
        face_kps,
    )  # torch.Size([1, 1280]), list(torch.Size([1, 577, 1024]))
217
+
218
+
219
def process_face_embeddings_infer(
    face_helper_1,
    clip_vision_model,
    face_helper_2,
    eva_transform_mean,
    eva_transform_std,
    app,
    device,
    weight_dtype,
    img_file_path,
    is_align_face=True,
):
    """
    Inference-time wrapper around `process_face_embeddings`.

    Loads the input (file path or numpy array), resizes it so the longer edge is
    1024 px, extracts the identity embeddings, and converts the aligned face
    crop back to a PIL image.

    Args:
        face_helper_1: Face helper object (first helper) for alignment and landmark detection.
        clip_vision_model: Pre-trained CLIP vision model used for feature extraction.
        face_helper_2: Face helper object (second helper) for embedding extraction.
        eva_transform_mean: Mean values for image normalization before passing to EVA model.
        eva_transform_std: Standard deviation values for image normalization before passing to EVA model.
        app: Application instance used for face detection.
        device: Device (CPU or GPU) where the computations will be performed.
        weight_dtype: Data type of the weights for precision (e.g., `torch.float32`).
        img_file_path: Path to the input image file (string) or a numpy array representing an image.
        is_align_face: Boolean flag indicating whether face alignment should be performed (default: True).

    Returns:
        Tuple of (id_cond, id_vit_hidden, image, face_kps) where `image` is the
        processed face crop as a `PIL.Image.Image`.
    """
    # Normalize the input to an RGB numpy array, honoring EXIF orientation
    # for raw arrays passed through PIL.
    if isinstance(img_file_path, str):
        rgb_image = np.array(load_image(image=img_file_path).convert("RGB"))
    else:
        rgb_image = np.array(ImageOps.exif_transpose(Image.fromarray(img_file_path)).convert("RGB"))

    # Cap the longer edge at 1024 px; keep the pre-crop image for the
    # no-alignment path inside process_face_embeddings.
    rgb_image = resize_numpy_image_long(rgb_image, 1024)
    original_id_image = rgb_image

    id_cond, id_vit_hidden, align_crop_face_image, face_kps = process_face_embeddings(
        face_helper_1,
        clip_vision_model,
        face_helper_2,
        eva_transform_mean,
        eva_transform_std,
        app,
        device,
        weight_dtype,
        rgb_image,
        original_id_image,
        is_align_face,
    )

    # CHW float tensor in [0, 1] -> HWC uint8 array -> PIL image.
    face_array = align_crop_face_image.cpu().detach().squeeze().permute(1, 2, 0).numpy()
    face_array = (face_array * 255).astype(np.uint8)
    face_pil = ImageOps.exif_transpose(Image.fromarray(face_array))

    return id_cond, id_vit_hidden, face_pil, face_kps
289
+
290
+
291
def prepare_face_models(model_path, device, dtype):
    """
    Prepare all face models for the facial recognition task.

    Parameters:
        model_path: Path to the directory containing model files.
        device: The device (e.g., 'cuda', 'xpu', 'cpu') where models will be loaded.
        dtype: Data type (e.g., torch.float32) for model inference.

    Returns:
        Tuple, in this order:
            - face_helper_1: facexlib restoration helper (detection + 512x512 alignment + parsing).
            - face_helper_2: insightface recognition backbone (fallback embedding extractor).
            - face_clip_model: EVA-CLIP visual tower for face feature extraction.
            - face_main_model: insightface FaceAnalysis app (antelopev2 detector).
            - eva_transform_mean: Per-channel mean for EVA-CLIP normalization.
            - eva_transform_std: Per-channel std for EVA-CLIP normalization.
    """
    face_encoder_root = os.path.join(model_path, "face_encoder")

    # get helper model: facexlib detection + alignment, plus a BiSeNet parsing head.
    face_helper_1 = FaceRestoreHelper(
        upscale_factor=1,
        face_size=512,
        crop_ratio=(1, 1),
        det_model="retinaface_resnet50",
        save_ext="png",
        device=device,
        model_rootpath=face_encoder_root,
    )
    face_helper_1.face_parse = init_parsing_model(
        model_name="bisenet", device=device, model_rootpath=face_encoder_root
    )
    face_helper_2 = insightface.model_zoo.get_model(
        f"{model_path}/face_encoder/models/antelopev2/glintr100.onnx", providers=["CUDAExecutionProvider"]
    )
    face_helper_2.prepare(ctx_id=0)

    # get local facial extractor part 1: EVA-CLIP visual tower.
    model, _, _ = create_model_and_transforms(
        "EVA02-CLIP-L-14-336",
        os.path.join(face_encoder_root, "EVA02_CLIP_L_336_psz14_s6B.pt"),
        force_custom_clip=True,
    )
    face_clip_model = model.visual
    # Fall back to the OpenAI dataset stats if the checkpoint does not carry its own.
    eva_transform_mean = getattr(face_clip_model, "image_mean", OPENAI_DATASET_MEAN)
    eva_transform_std = getattr(face_clip_model, "image_std", OPENAI_DATASET_STD)
    if not isinstance(eva_transform_mean, (list, tuple)):
        eva_transform_mean = (eva_transform_mean,) * 3
    if not isinstance(eva_transform_std, (list, tuple)):
        eva_transform_std = (eva_transform_std,) * 3

    # get local facial extractor part 2: antelopev2 detector/recognizer app.
    face_main_model = FaceAnalysis(
        name="antelopev2", root=face_encoder_root, providers=["CUDAExecutionProvider"]
    )
    face_main_model.prepare(ctx_id=0, det_size=(640, 640))

    # move face models to device and switch to inference mode
    face_helper_1.face_det.eval()
    face_helper_1.face_parse.eval()
    face_clip_model.eval()
    face_helper_1.face_det.to(device)
    face_helper_1.face_parse.to(device)
    face_clip_model.to(device, dtype=dtype)

    return face_helper_1, face_helper_2, face_clip_model, face_main_model, eva_transform_mean, eva_transform_std
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/consisid/pipeline_consisid.py ADDED
@@ -0,0 +1,974 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 ConsisID Authors and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ import math
17
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
18
+
19
+ import numpy as np
20
+ import PIL
21
+ import torch
22
+ from transformers import T5EncoderModel, T5Tokenizer
23
+
24
+ from ...callbacks import MultiPipelineCallbacks, PipelineCallback
25
+ from ...image_processor import PipelineImageInput
26
+ from ...loaders import CogVideoXLoraLoaderMixin
27
+ from ...models import AutoencoderKLCogVideoX, ConsisIDTransformer3DModel
28
+ from ...models.embeddings import get_3d_rotary_pos_embed
29
+ from ...pipelines.pipeline_utils import DiffusionPipeline
30
+ from ...schedulers import CogVideoXDPMScheduler
31
+ from ...utils import is_opencv_available, logging, replace_example_docstring
32
+ from ...utils.torch_utils import randn_tensor
33
+ from ...video_processor import VideoProcessor
34
+ from .pipeline_output import ConsisIDPipelineOutput
35
+
36
+
37
+ if is_opencv_available():
38
+ import cv2
39
+
40
+
41
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
42
+
43
+
44
+ EXAMPLE_DOC_STRING = """
45
+ Examples:
46
+ ```python
47
+ >>> import torch
48
+ >>> from diffusers import ConsisIDPipeline
49
+ >>> from diffusers.pipelines.consisid.consisid_utils import prepare_face_models, process_face_embeddings_infer
50
+ >>> from diffusers.utils import export_to_video
51
+ >>> from huggingface_hub import snapshot_download
52
+
53
+ >>> snapshot_download(repo_id="BestWishYsh/ConsisID-preview", local_dir="BestWishYsh/ConsisID-preview")
54
+ >>> (
55
+ ... face_helper_1,
56
+ ... face_helper_2,
57
+ ... face_clip_model,
58
+ ... face_main_model,
59
+ ... eva_transform_mean,
60
+ ... eva_transform_std,
61
+ ... ) = prepare_face_models("BestWishYsh/ConsisID-preview", device="cuda", dtype=torch.bfloat16)
62
+ >>> pipe = ConsisIDPipeline.from_pretrained("BestWishYsh/ConsisID-preview", torch_dtype=torch.bfloat16)
63
+ >>> pipe.to("cuda")
64
+
65
+ >>> # ConsisID works well with long and well-described prompts. Make sure the face in the image is clearly visible (e.g., preferably half-body or full-body).
66
+ >>> prompt = "The video captures a boy walking along a city street, filmed in black and white on a classic 35mm camera. His expression is thoughtful, his brow slightly furrowed as if he's lost in contemplation. The film grain adds a textured, timeless quality to the image, evoking a sense of nostalgia. Around him, the cityscape is filled with vintage buildings, cobblestone sidewalks, and softly blurred figures passing by, their outlines faint and indistinct. Streetlights cast a gentle glow, while shadows play across the boy's path, adding depth to the scene. The lighting highlights the boy's subtle smile, hinting at a fleeting moment of curiosity. The overall cinematic atmosphere, complete with classic film still aesthetics and dramatic contrasts, gives the scene an evocative and introspective feel."
67
+ >>> image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_input.png?download=true"
68
+
69
+ >>> id_cond, id_vit_hidden, image, face_kps = process_face_embeddings_infer(
70
+ ... face_helper_1,
71
+ ... face_clip_model,
72
+ ... face_helper_2,
73
+ ... eva_transform_mean,
74
+ ... eva_transform_std,
75
+ ... face_main_model,
76
+ ... "cuda",
77
+ ... torch.bfloat16,
78
+ ... image,
79
+ ... is_align_face=True,
80
+ ... )
81
+
82
+ >>> video = pipe(
83
+ ... image=image,
84
+ ... prompt=prompt,
85
+ ... num_inference_steps=50,
86
+ ... guidance_scale=6.0,
87
+ ... use_dynamic_cfg=False,
88
+ ... id_vit_hidden=id_vit_hidden,
89
+ ... id_cond=id_cond,
90
+ ... kps_cond=face_kps,
91
+ ... generator=torch.Generator("cuda").manual_seed(42),
92
+ ... )
93
+ >>> export_to_video(video.frames[0], "output.mp4", fps=8)
94
+ ```
95
+ """
96
+
97
+
98
def draw_kps(image_pil, kps, color_list=None):
    """
    Draw five facial keypoints and the limbs connecting them onto a blank canvas.

    Parameters:
    - image_pil (PIL.Image): Input image; only its size is used to shape the output canvas.
    - kps (list of tuples): A list of keypoints where each keypoint is a tuple of (x, y) coordinates.
    - color_list (list of tuples, optional): List of colors (in RGB format) for each keypoint. Defaults to
      red, green, blue, yellow and magenta.

    Returns:
    - PIL.Image: A black image with the (dimmed) limbs and the keypoints drawn on it.
    """
    # Avoid a mutable default argument; build the canonical five-color palette lazily.
    if color_list is None:
        color_list = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255)]

    stickwidth = 4
    # Each limb connects one of keypoints 0, 1, 3, 4 to keypoint 2 (typical 5-point face layout).
    limbSeq = np.array([[0, 2], [1, 2], [3, 2], [4, 2]])
    kps = np.array(kps)

    w, h = image_pil.size
    out_img = np.zeros([h, w, 3])

    for i in range(len(limbSeq)):
        index = limbSeq[i]
        color = color_list[index[0]]

        x = kps[index][:, 0]
        y = kps[index][:, 1]
        length = ((x[0] - x[1]) ** 2 + (y[0] - y[1]) ** 2) ** 0.5
        angle = math.degrees(math.atan2(y[0] - y[1], x[0] - x[1]))
        # Render the limb as a filled, rotated ellipse — effectively a thick line segment.
        polygon = cv2.ellipse2Poly(
            (int(np.mean(x)), int(np.mean(y))), (int(length / 2), stickwidth), int(angle), 0, 360, 1
        )
        out_img = cv2.fillConvexPoly(out_img.copy(), polygon, color)
    # Dim the limbs so the keypoint circles drawn below stand out.
    out_img = (out_img * 0.6).astype(np.uint8)

    for idx_kp, kp in enumerate(kps):
        color = color_list[idx_kp]
        x, y = kp
        out_img = cv2.circle(out_img.copy(), (int(x), int(y)), 10, color, -1)

    out_img_pil = PIL.Image.fromarray(out_img.astype(np.uint8))
    return out_img_pil
140
+
141
+
142
+ # Similar to diffusers.pipelines.hunyuandit.pipeline_hunyuandit.get_resize_crop_region_for_grid
143
def get_resize_crop_region_for_grid(src, tgt_width, tgt_height):
    """
    Compute the centered crop region obtained when resizing a source resolution to fit a target
    box while preserving the aspect ratio.

    Parameters:
    - src (tuple): (height, width) of the source grid/image.
    - tgt_width (int): The target width.
    - tgt_height (int): The target height.

    Returns:
    - tuple: ((crop_top, crop_left), (crop_bottom, crop_right)) — the top-left and bottom-right
      corners of the crop region inside the target box.
    """
    src_h, src_w = src

    # Pick the scaling so the resized content fits inside the target box.
    if src_h / src_w > tgt_height / tgt_width:
        # Relatively taller source: match heights, scale width proportionally.
        new_h = tgt_height
        new_w = int(round(tgt_height / src_h * src_w))
    else:
        # Relatively wider (or equal aspect) source: match widths, scale height proportionally.
        new_w = tgt_width
        new_h = int(round(tgt_width / src_w * src_h))

    # Center the resized content within the target box.
    top = int(round((tgt_height - new_h) / 2.0))
    left = int(round((tgt_width - new_w) / 2.0))

    return (top, left), (top + new_h, left + new_w)
174
+
175
+
176
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
177
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    r"""
    Call `scheduler.set_timesteps` and return the resulting timestep schedule.

    Supports three mutually related modes: a plain `num_inference_steps`, an explicit custom
    `timesteps` list, or an explicit custom `sigmas` list. Extra `kwargs` are forwarded to
    `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`): The scheduler to get timesteps from.
        num_inference_steps (`int`): Number of diffusion steps; mutually exclusive with `timesteps`.
        device (`str` or `torch.device`, *optional*): Device to move the timesteps to.
        timesteps (`List[int]`, *optional*): Custom timestep schedule; excludes `num_inference_steps`/`sigmas`.
        sigmas (`List[float]`, *optional*): Custom sigma schedule; excludes `num_inference_steps`/`timesteps`.

    Returns:
        `Tuple[torch.Tensor, int]`: The timestep schedule and the number of inference steps.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")

    if timesteps is not None:
        # Custom timestep schedules require scheduler support.
        if "timesteps" not in inspect.signature(scheduler.set_timesteps).parameters:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    elif sigmas is not None:
        # Custom sigma schedules likewise require scheduler support.
        if "sigmas" not in inspect.signature(scheduler.set_timesteps).parameters:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps

    return timesteps, num_inference_steps
234
+
235
+
236
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
237
def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    """
    Extract latents from a VAE encoder output, either by sampling the posterior (`sample_mode="sample"`),
    taking its mode (`sample_mode="argmax"`), or reading a plain `.latents` attribute.
    """
    has_dist = hasattr(encoder_output, "latent_dist")
    if has_dist and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    if has_dist and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    if hasattr(encoder_output, "latents"):
        return encoder_output.latents
    raise AttributeError("Could not access latents of provided encoder_output")
248
+
249
+
250
+ class ConsisIDPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
251
+ r"""
252
+ Pipeline for image-to-video generation using ConsisID.
253
+
254
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
255
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
256
+
257
+ Args:
258
+ vae ([`AutoencoderKL`]):
259
+ Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
260
+ text_encoder ([`T5EncoderModel`]):
261
+ Frozen text-encoder. ConsisID uses
262
+ [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the
263
+ [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
264
+ tokenizer (`T5Tokenizer`):
265
+ Tokenizer of class
266
+ [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
267
+ transformer ([`ConsisIDTransformer3DModel`]):
268
+ A text conditioned `ConsisIDTransformer3DModel` to denoise the encoded video latents.
269
+ scheduler ([`SchedulerMixin`]):
270
+ A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
271
+ """
272
+
273
    # No sub-models are optional for this pipeline.
    _optional_components = []
    # Order in which sub-models are moved to the accelerator under model CPU offload.
    model_cpu_offload_seq = "text_encoder->transformer->vae"

    # Tensor names that `callback_on_step_end_tensor_inputs` is allowed to request.
    _callback_tensor_inputs = [
        "latents",
        "prompt_embeds",
        "negative_prompt_embeds",
    ]
281
+
282
+ def __init__(
283
+ self,
284
+ tokenizer: T5Tokenizer,
285
+ text_encoder: T5EncoderModel,
286
+ vae: AutoencoderKLCogVideoX,
287
+ transformer: ConsisIDTransformer3DModel,
288
+ scheduler: CogVideoXDPMScheduler,
289
+ ):
290
+ super().__init__()
291
+
292
+ self.register_modules(
293
+ tokenizer=tokenizer,
294
+ text_encoder=text_encoder,
295
+ vae=vae,
296
+ transformer=transformer,
297
+ scheduler=scheduler,
298
+ )
299
+ self.vae_scale_factor_spatial = (
300
+ 2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
301
+ )
302
+ self.vae_scale_factor_temporal = (
303
+ self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
304
+ )
305
+ self.vae_scaling_factor_image = (
306
+ self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
307
+ )
308
+
309
+ self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
310
+
311
+ # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._get_t5_prompt_embeds
312
+ def _get_t5_prompt_embeds(
313
+ self,
314
+ prompt: Union[str, List[str]] = None,
315
+ num_videos_per_prompt: int = 1,
316
+ max_sequence_length: int = 226,
317
+ device: Optional[torch.device] = None,
318
+ dtype: Optional[torch.dtype] = None,
319
+ ):
320
+ device = device or self._execution_device
321
+ dtype = dtype or self.text_encoder.dtype
322
+
323
+ prompt = [prompt] if isinstance(prompt, str) else prompt
324
+ batch_size = len(prompt)
325
+
326
+ text_inputs = self.tokenizer(
327
+ prompt,
328
+ padding="max_length",
329
+ max_length=max_sequence_length,
330
+ truncation=True,
331
+ add_special_tokens=True,
332
+ return_tensors="pt",
333
+ )
334
+ text_input_ids = text_inputs.input_ids
335
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
336
+
337
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
338
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
339
+ logger.warning(
340
+ "The following part of your input was truncated because `max_sequence_length` is set to "
341
+ f" {max_sequence_length} tokens: {removed_text}"
342
+ )
343
+
344
+ prompt_embeds = self.text_encoder(text_input_ids.to(device))[0]
345
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
346
+
347
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
348
+ _, seq_len, _ = prompt_embeds.shape
349
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
350
+ prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
351
+
352
+ return prompt_embeds
353
+
354
+ # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.encode_prompt
355
+ def encode_prompt(
356
+ self,
357
+ prompt: Union[str, List[str]],
358
+ negative_prompt: Optional[Union[str, List[str]]] = None,
359
+ do_classifier_free_guidance: bool = True,
360
+ num_videos_per_prompt: int = 1,
361
+ prompt_embeds: Optional[torch.Tensor] = None,
362
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
363
+ max_sequence_length: int = 226,
364
+ device: Optional[torch.device] = None,
365
+ dtype: Optional[torch.dtype] = None,
366
+ ):
367
+ r"""
368
+ Encodes the prompt into text encoder hidden states.
369
+
370
+ Args:
371
+ prompt (`str` or `List[str]`, *optional*):
372
+ prompt to be encoded
373
+ negative_prompt (`str` or `List[str]`, *optional*):
374
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
375
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
376
+ less than `1`).
377
+ do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
378
+ Whether to use classifier free guidance or not.
379
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
380
+ Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
381
+ prompt_embeds (`torch.Tensor`, *optional*):
382
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
383
+ provided, text embeddings will be generated from `prompt` input argument.
384
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
385
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
386
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
387
+ argument.
388
+ device: (`torch.device`, *optional*):
389
+ torch device
390
+ dtype: (`torch.dtype`, *optional*):
391
+ torch dtype
392
+ """
393
+ device = device or self._execution_device
394
+
395
+ prompt = [prompt] if isinstance(prompt, str) else prompt
396
+ if prompt is not None:
397
+ batch_size = len(prompt)
398
+ else:
399
+ batch_size = prompt_embeds.shape[0]
400
+
401
+ if prompt_embeds is None:
402
+ prompt_embeds = self._get_t5_prompt_embeds(
403
+ prompt=prompt,
404
+ num_videos_per_prompt=num_videos_per_prompt,
405
+ max_sequence_length=max_sequence_length,
406
+ device=device,
407
+ dtype=dtype,
408
+ )
409
+
410
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
411
+ negative_prompt = negative_prompt or ""
412
+ negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
413
+
414
+ if prompt is not None and type(prompt) is not type(negative_prompt):
415
+ raise TypeError(
416
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
417
+ f" {type(prompt)}."
418
+ )
419
+ elif batch_size != len(negative_prompt):
420
+ raise ValueError(
421
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
422
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
423
+ " the batch size of `prompt`."
424
+ )
425
+
426
+ negative_prompt_embeds = self._get_t5_prompt_embeds(
427
+ prompt=negative_prompt,
428
+ num_videos_per_prompt=num_videos_per_prompt,
429
+ max_sequence_length=max_sequence_length,
430
+ device=device,
431
+ dtype=dtype,
432
+ )
433
+
434
+ return prompt_embeds, negative_prompt_embeds
435
+
436
+ def prepare_latents(
437
+ self,
438
+ image: torch.Tensor,
439
+ batch_size: int = 1,
440
+ num_channels_latents: int = 16,
441
+ num_frames: int = 13,
442
+ height: int = 60,
443
+ width: int = 90,
444
+ dtype: Optional[torch.dtype] = None,
445
+ device: Optional[torch.device] = None,
446
+ generator: Optional[torch.Generator] = None,
447
+ latents: Optional[torch.Tensor] = None,
448
+ kps_cond: Optional[torch.Tensor] = None,
449
+ ):
450
+ if isinstance(generator, list) and len(generator) != batch_size:
451
+ raise ValueError(
452
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
453
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
454
+ )
455
+
456
+ num_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
457
+ shape = (
458
+ batch_size,
459
+ num_frames,
460
+ num_channels_latents,
461
+ height // self.vae_scale_factor_spatial,
462
+ width // self.vae_scale_factor_spatial,
463
+ )
464
+
465
+ image = image.unsqueeze(2) # [B, C, F, H, W]
466
+
467
+ if isinstance(generator, list):
468
+ image_latents = [
469
+ retrieve_latents(self.vae.encode(image[i].unsqueeze(0)), generator[i]) for i in range(batch_size)
470
+ ]
471
+ if kps_cond is not None:
472
+ kps_cond = kps_cond.unsqueeze(2)
473
+ kps_cond_latents = [
474
+ retrieve_latents(self.vae.encode(kps_cond[i].unsqueeze(0)), generator[i])
475
+ for i in range(batch_size)
476
+ ]
477
+ else:
478
+ image_latents = [retrieve_latents(self.vae.encode(img.unsqueeze(0)), generator) for img in image]
479
+ if kps_cond is not None:
480
+ kps_cond = kps_cond.unsqueeze(2)
481
+ kps_cond_latents = [retrieve_latents(self.vae.encode(img.unsqueeze(0)), generator) for img in kps_cond]
482
+
483
+ image_latents = torch.cat(image_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4) # [B, F, C, H, W]
484
+ image_latents = self.vae_scaling_factor_image * image_latents
485
+
486
+ if kps_cond is not None:
487
+ kps_cond_latents = torch.cat(kps_cond_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4) # [B, F, C, H, W]
488
+ kps_cond_latents = self.vae_scaling_factor_image * kps_cond_latents
489
+
490
+ padding_shape = (
491
+ batch_size,
492
+ num_frames - 2,
493
+ num_channels_latents,
494
+ height // self.vae_scale_factor_spatial,
495
+ width // self.vae_scale_factor_spatial,
496
+ )
497
+ else:
498
+ padding_shape = (
499
+ batch_size,
500
+ num_frames - 1,
501
+ num_channels_latents,
502
+ height // self.vae_scale_factor_spatial,
503
+ width // self.vae_scale_factor_spatial,
504
+ )
505
+
506
+ latent_padding = torch.zeros(padding_shape, device=device, dtype=dtype)
507
+ if kps_cond is not None:
508
+ image_latents = torch.cat([image_latents, kps_cond_latents, latent_padding], dim=1)
509
+ else:
510
+ image_latents = torch.cat([image_latents, latent_padding], dim=1)
511
+
512
+ if latents is None:
513
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
514
+ else:
515
+ latents = latents.to(device)
516
+
517
+ # scale the initial noise by the standard deviation required by the scheduler
518
+ latents = latents * self.scheduler.init_noise_sigma
519
+ return latents, image_latents
520
+
521
+ # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.decode_latents
522
+ def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
523
+ latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width]
524
+ latents = 1 / self.vae_scaling_factor_image * latents
525
+
526
+ frames = self.vae.decode(latents).sample
527
+ return frames
528
+
529
+ # Copied from diffusers.pipelines.animatediff.pipeline_animatediff_video2video.AnimateDiffVideoToVideoPipeline.get_timesteps
530
+ def get_timesteps(self, num_inference_steps, timesteps, strength, device):
531
+ # get the original timestep using init_timestep
532
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
533
+
534
+ t_start = max(num_inference_steps - init_timestep, 0)
535
+ timesteps = timesteps[t_start * self.scheduler.order :]
536
+
537
+ return timesteps, num_inference_steps - t_start
538
+
539
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
540
+ def prepare_extra_step_kwargs(self, generator, eta):
541
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
542
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
543
+ # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
544
+ # and should be between [0, 1]
545
+
546
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
547
+ extra_step_kwargs = {}
548
+ if accepts_eta:
549
+ extra_step_kwargs["eta"] = eta
550
+
551
+ # check if the scheduler accepts generator
552
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
553
+ if accepts_generator:
554
+ extra_step_kwargs["generator"] = generator
555
+ return extra_step_kwargs
556
+
557
+ def check_inputs(
558
+ self,
559
+ image,
560
+ prompt,
561
+ height,
562
+ width,
563
+ negative_prompt,
564
+ callback_on_step_end_tensor_inputs,
565
+ latents=None,
566
+ prompt_embeds=None,
567
+ negative_prompt_embeds=None,
568
+ ):
569
+ if (
570
+ not isinstance(image, torch.Tensor)
571
+ and not isinstance(image, PIL.Image.Image)
572
+ and not isinstance(image, list)
573
+ ):
574
+ raise ValueError(
575
+ "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
576
+ f" {type(image)}"
577
+ )
578
+
579
+ if height % 8 != 0 or width % 8 != 0:
580
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
581
+
582
+ if callback_on_step_end_tensor_inputs is not None and not all(
583
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
584
+ ):
585
+ raise ValueError(
586
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
587
+ )
588
+ if prompt is not None and prompt_embeds is not None:
589
+ raise ValueError(
590
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
591
+ " only forward one of the two."
592
+ )
593
+ elif prompt is None and prompt_embeds is None:
594
+ raise ValueError(
595
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
596
+ )
597
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
598
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
599
+
600
+ if prompt is not None and negative_prompt_embeds is not None:
601
+ raise ValueError(
602
+ f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
603
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
604
+ )
605
+
606
+ if negative_prompt is not None and negative_prompt_embeds is not None:
607
+ raise ValueError(
608
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
609
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
610
+ )
611
+
612
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
613
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
614
+ raise ValueError(
615
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
616
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
617
+ f" {negative_prompt_embeds.shape}."
618
+ )
619
+
620
+ def _prepare_rotary_positional_embeddings(
621
+ self,
622
+ height: int,
623
+ width: int,
624
+ num_frames: int,
625
+ device: torch.device,
626
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
627
+ grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
628
+ grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
629
+ base_size_width = self.transformer.config.sample_width // self.transformer.config.patch_size
630
+ base_size_height = self.transformer.config.sample_height // self.transformer.config.patch_size
631
+
632
+ grid_crops_coords = get_resize_crop_region_for_grid(
633
+ (grid_height, grid_width), base_size_width, base_size_height
634
+ )
635
+ freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
636
+ embed_dim=self.transformer.config.attention_head_dim,
637
+ crops_coords=grid_crops_coords,
638
+ grid_size=(grid_height, grid_width),
639
+ temporal_size=num_frames,
640
+ device=device,
641
+ )
642
+
643
+ return freqs_cos, freqs_sin
644
+
645
    @property
    def guidance_scale(self):
        # Classifier-free guidance scale set for the current __call__ invocation.
        return self._guidance_scale
648
+
649
    @property
    def num_timesteps(self):
        # Number of denoising timesteps used in the current __call__ invocation.
        return self._num_timesteps
652
+
653
    @property
    def attention_kwargs(self):
        # Extra kwargs forwarded to the attention processors during the current call.
        return self._attention_kwargs
656
+
657
    @property
    def interrupt(self):
        # Flag callbacks can set to stop the denoising loop early.
        return self._interrupt
660
+
661
+ @torch.no_grad()
662
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
663
+ def __call__(
664
+ self,
665
+ image: PipelineImageInput,
666
+ prompt: Optional[Union[str, List[str]]] = None,
667
+ negative_prompt: Optional[Union[str, List[str]]] = None,
668
+ height: int = 480,
669
+ width: int = 720,
670
+ num_frames: int = 49,
671
+ num_inference_steps: int = 50,
672
+ guidance_scale: float = 6.0,
673
+ use_dynamic_cfg: bool = False,
674
+ num_videos_per_prompt: int = 1,
675
+ eta: float = 0.0,
676
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
677
+ latents: Optional[torch.FloatTensor] = None,
678
+ prompt_embeds: Optional[torch.FloatTensor] = None,
679
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
680
+ output_type: str = "pil",
681
+ return_dict: bool = True,
682
+ attention_kwargs: Optional[Dict[str, Any]] = None,
683
+ callback_on_step_end: Optional[
684
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
685
+ ] = None,
686
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
687
+ max_sequence_length: int = 226,
688
+ id_vit_hidden: Optional[torch.Tensor] = None,
689
+ id_cond: Optional[torch.Tensor] = None,
690
+ kps_cond: Optional[torch.Tensor] = None,
691
+ ) -> Union[ConsisIDPipelineOutput, Tuple]:
692
+ """
693
+ Function invoked when calling the pipeline for generation.
694
+
695
+ Args:
696
+ image (`PipelineImageInput`):
697
+ The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`.
698
+ prompt (`str` or `List[str]`, *optional*):
699
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
700
+ instead.
701
+ negative_prompt (`str` or `List[str]`, *optional*):
702
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
703
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
704
+ less than `1`).
705
+ height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
706
+ The height in pixels of the generated image. This is set to 480 by default for the best results.
707
+ width (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
708
+ The width in pixels of the generated image. This is set to 720 by default for the best results.
709
+ num_frames (`int`, defaults to `49`):
710
+ Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will
711
+ contain 1 extra frame because ConsisID is conditioned with (num_seconds * fps + 1) frames where
712
+ num_seconds is 6 and fps is 4. However, since videos can be saved at any fps, the only condition that
713
+ needs to be satisfied is that of divisibility mentioned above.
714
+ num_inference_steps (`int`, *optional*, defaults to 50):
715
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
716
+ expense of slower inference.
717
+ guidance_scale (`float`, *optional*, defaults to 6):
718
+ Guidance scale as defined in [Classifier-Free Diffusion
719
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
720
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
721
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
722
+ the text `prompt`, usually at the expense of lower image quality.
723
+ use_dynamic_cfg (`bool`, *optional*, defaults to `False`):
724
+ If True, dynamically adjusts the guidance scale during inference. This allows the model to use a
725
+ progressive guidance scale, improving the balance between text-guided generation and image quality over
726
+ the course of the inference steps. Typically, early inference steps use a higher guidance scale for
727
+ more faithful image generation, while later steps reduce it for more diverse and natural results.
728
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
729
+ The number of videos to generate per prompt.
730
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
731
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
732
+ to make generation deterministic.
733
+ latents (`torch.FloatTensor`, *optional*):
734
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
735
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
736
+ tensor will be generated by sampling using the supplied random `generator`.
737
+ prompt_embeds (`torch.FloatTensor`, *optional*):
738
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
739
+ provided, text embeddings will be generated from `prompt` input argument.
740
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
741
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
742
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
743
+ argument.
744
+ output_type (`str`, *optional*, defaults to `"pil"`):
745
+ The output format of the generate image. Choose between
746
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
747
+ return_dict (`bool`, *optional*, defaults to `True`):
748
+ Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
749
+ of a plain tuple.
750
+ attention_kwargs (`dict`, *optional*):
751
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
752
+ `self.processor` in
753
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
754
+ callback_on_step_end (`Callable`, *optional*):
755
+ A function that calls at the end of each denoising steps during the inference. The function is called
756
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
757
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
758
+ `callback_on_step_end_tensor_inputs`.
759
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
760
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
761
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
762
+ `._callback_tensor_inputs` attribute of your pipeline class.
763
+ max_sequence_length (`int`, defaults to `226`):
764
+ Maximum sequence length in encoded prompt. Must be consistent with
765
+ `self.transformer.config.max_text_seq_length` otherwise may lead to poor results.
766
+ id_vit_hidden (`Optional[torch.Tensor]`, *optional*):
767
+ The tensor representing the hidden features extracted from the face model, which are used to condition
768
+ the local facial extractor. This is crucial for the model to obtain high-frequency information of the
769
+ face. If not provided, the local facial extractor will not run normally.
770
+ id_cond (`Optional[torch.Tensor]`, *optional*):
771
+ The tensor representing the hidden features extracted from the clip model, which are used to condition
772
+ the local facial extractor. This is crucial for the model to edit facial features If not provided, the
773
+ local facial extractor will not run normally.
774
+ kps_cond (`Optional[torch.Tensor]`, *optional*):
775
+ A tensor that determines whether the global facial extractor use keypoint information for conditioning.
776
+ If provided, this tensor controls whether facial keypoints such as eyes, nose, and mouth landmarks are
777
+ used during the generation process. This helps ensure the model retains more facial low-frequency
778
+ information.
779
+
780
+ Examples:
781
+
782
+ Returns:
783
+ [`~pipelines.consisid.pipeline_output.ConsisIDPipelineOutput`] or `tuple`:
784
+ [`~pipelines.consisid.pipeline_output.ConsisIDPipelineOutput`] if `return_dict` is True, otherwise a
785
+ `tuple`. When returning a tuple, the first element is a list with the generated images.
786
+ """
787
+
788
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
789
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
790
+
791
+ height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
792
+ width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
793
+ num_frames = num_frames or self.transformer.config.sample_frames
794
+
795
+ num_videos_per_prompt = 1
796
+
797
+ # 1. Check inputs. Raise error if not correct
798
+ self.check_inputs(
799
+ image=image,
800
+ prompt=prompt,
801
+ height=height,
802
+ width=width,
803
+ negative_prompt=negative_prompt,
804
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
805
+ latents=latents,
806
+ prompt_embeds=prompt_embeds,
807
+ negative_prompt_embeds=negative_prompt_embeds,
808
+ )
809
+ self._guidance_scale = guidance_scale
810
+ self._attention_kwargs = attention_kwargs
811
+ self._interrupt = False
812
+
813
+ # 2. Default call parameters
814
+ if prompt is not None and isinstance(prompt, str):
815
+ batch_size = 1
816
+ elif prompt is not None and isinstance(prompt, list):
817
+ batch_size = len(prompt)
818
+ else:
819
+ batch_size = prompt_embeds.shape[0]
820
+
821
+ device = self._execution_device
822
+
823
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
824
+ # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
825
+ # corresponds to doing no classifier free guidance.
826
+ do_classifier_free_guidance = guidance_scale > 1.0
827
+
828
+ # 3. Encode input prompt
829
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
830
+ prompt=prompt,
831
+ negative_prompt=negative_prompt,
832
+ do_classifier_free_guidance=do_classifier_free_guidance,
833
+ num_videos_per_prompt=num_videos_per_prompt,
834
+ prompt_embeds=prompt_embeds,
835
+ negative_prompt_embeds=negative_prompt_embeds,
836
+ max_sequence_length=max_sequence_length,
837
+ device=device,
838
+ )
839
+ if do_classifier_free_guidance:
840
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
841
+
842
+ # 4. Prepare timesteps
843
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device)
844
+ self._num_timesteps = len(timesteps)
845
+
846
+ # 5. Prepare latents
847
+ is_kps = getattr(self.transformer.config, "is_kps", False)
848
+ kps_cond = kps_cond if is_kps else None
849
+ if kps_cond is not None:
850
+ kps_cond = draw_kps(image, kps_cond)
851
+ kps_cond = self.video_processor.preprocess(kps_cond, height=height, width=width).to(
852
+ device, dtype=prompt_embeds.dtype
853
+ )
854
+
855
+ image = self.video_processor.preprocess(image, height=height, width=width).to(
856
+ device, dtype=prompt_embeds.dtype
857
+ )
858
+
859
+ latent_channels = self.transformer.config.in_channels // 2
860
+ latents, image_latents = self.prepare_latents(
861
+ image,
862
+ batch_size * num_videos_per_prompt,
863
+ latent_channels,
864
+ num_frames,
865
+ height,
866
+ width,
867
+ prompt_embeds.dtype,
868
+ device,
869
+ generator,
870
+ latents,
871
+ kps_cond,
872
+ )
873
+
874
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
875
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
876
+
877
+ # 7. Create rotary embeds if required
878
+ image_rotary_emb = (
879
+ self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
880
+ if self.transformer.config.use_rotary_positional_embeddings
881
+ else None
882
+ )
883
+
884
+ # 8. Denoising loop
885
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
886
+
887
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
888
+ # for DPM-solver++
889
+ old_pred_original_sample = None
890
+ timesteps_cpu = timesteps.cpu()
891
+ for i, t in enumerate(timesteps):
892
+ if self.interrupt:
893
+ continue
894
+
895
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
896
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
897
+
898
+ latent_image_input = torch.cat([image_latents] * 2) if do_classifier_free_guidance else image_latents
899
+ latent_model_input = torch.cat([latent_model_input, latent_image_input], dim=2)
900
+
901
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
902
+ timestep = t.expand(latent_model_input.shape[0])
903
+
904
+ # predict noise model_output
905
+ noise_pred = self.transformer(
906
+ hidden_states=latent_model_input,
907
+ encoder_hidden_states=prompt_embeds,
908
+ timestep=timestep,
909
+ image_rotary_emb=image_rotary_emb,
910
+ attention_kwargs=attention_kwargs,
911
+ return_dict=False,
912
+ id_vit_hidden=id_vit_hidden,
913
+ id_cond=id_cond,
914
+ )[0]
915
+ noise_pred = noise_pred.float()
916
+
917
+ # perform guidance
918
+ if use_dynamic_cfg:
919
+ self._guidance_scale = 1 + guidance_scale * (
920
+ (
921
+ 1
922
+ - math.cos(
923
+ math.pi
924
+ * ((num_inference_steps - timesteps_cpu[i].item()) / num_inference_steps) ** 5.0
925
+ )
926
+ )
927
+ / 2
928
+ )
929
+ if do_classifier_free_guidance:
930
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
931
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
932
+
933
+ # compute the previous noisy sample x_t -> x_t-1
934
+ if not isinstance(self.scheduler, CogVideoXDPMScheduler):
935
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
936
+ else:
937
+ latents, old_pred_original_sample = self.scheduler.step(
938
+ noise_pred,
939
+ old_pred_original_sample,
940
+ t,
941
+ timesteps[i - 1] if i > 0 else None,
942
+ latents,
943
+ **extra_step_kwargs,
944
+ return_dict=False,
945
+ )
946
+ latents = latents.to(prompt_embeds.dtype)
947
+
948
+ # call the callback, if provided
949
+ if callback_on_step_end is not None:
950
+ callback_kwargs = {}
951
+ for k in callback_on_step_end_tensor_inputs:
952
+ callback_kwargs[k] = locals()[k]
953
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
954
+
955
+ latents = callback_outputs.pop("latents", latents)
956
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
957
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
958
+
959
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
960
+ progress_bar.update()
961
+
962
+ if not output_type == "latent":
963
+ video = self.decode_latents(latents)
964
+ video = self.video_processor.postprocess_video(video=video, output_type=output_type)
965
+ else:
966
+ video = latents
967
+
968
+ # Offload all models
969
+ self.maybe_free_model_hooks()
970
+
971
+ if not return_dict:
972
+ return (video,)
973
+
974
+ return ConsisIDPipelineOutput(frames=video)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/consisid/pipeline_output.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+
3
+ import torch
4
+
5
+ from diffusers.utils import BaseOutput
6
+
7
+
8
@dataclass
class ConsisIDPipelineOutput(BaseOutput):
    r"""
    Output class for ConsisID pipelines.

    Args:
        frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
            denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
            `(batch_size, num_frames, channels, height, width)`.
    """

    # Generated video frames. Annotated as `torch.Tensor` but, per the docstring
    # above, may also hold a NumPy array or a nested list of PIL images depending
    # on the pipeline's `output_type` argument.
    frames: torch.Tensor