silencer107 committed on
Commit b41ec59 · verified · 1 Parent(s): 19d912d

Update src/pipeline.py

Files changed (1)
  1. src/pipeline.py +13 -1244
src/pipeline.py CHANGED
@@ -1,1246 +1,12 @@
1
  import torch
 
 
2
  from PIL.Image import Image
 
 
3
  from pipelines.models import TextToImageRequest
4
  from torch import Generator
5
 
6
- import inspect
7
- from typing import Any, Callable, Dict, List, Optional, Tuple, Union
8
-
9
- from transformers import (
10
- CLIPImageProcessor,
11
- CLIPTextModel,
12
- CLIPTextModelWithProjection,
13
- CLIPTokenizer,
14
- CLIPVisionModelWithProjection,
15
- )
16
-
17
- from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
18
- from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
19
- from diffusers.loaders import (
20
- FromSingleFileMixin,
21
- IPAdapterMixin,
22
- StableDiffusionXLLoraLoaderMixin,
23
- TextualInversionLoaderMixin,
24
- )
25
- from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
26
- from diffusers.models.attention_processor import (
27
- AttnProcessor2_0,
28
- FusedAttnProcessor2_0,
29
- XFormersAttnProcessor,
30
- )
31
- from diffusers.models.lora import adjust_lora_scale_text_encoder
32
- from diffusers.schedulers import KarrasDiffusionSchedulers
33
- from diffusers.utils import (
34
- USE_PEFT_BACKEND,
35
- deprecate,
36
- is_invisible_watermark_available,
37
- is_torch_xla_available,
38
- logging,
39
- replace_example_docstring,
40
- scale_lora_layers,
41
- unscale_lora_layers,
42
- )
43
- from diffusers.utils.torch_utils import randn_tensor
44
- from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
45
- from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
46
-
47
-
48
- if is_invisible_watermark_available():
49
- from .watermark import StableDiffusionXLWatermarker
50
-
51
- if is_torch_xla_available():
52
- import torch_xla.core.xla_model as xm
53
-
54
- XLA_AVAILABLE = True
55
- else:
56
- XLA_AVAILABLE = False
57
-
58
-
59
- logger = logging.get_logger(__name__) # pylint: disable=invalid-name
60
-
61
- EXAMPLE_DOC_STRING = """
62
- Examples:
63
- ```py
64
- >>> import torch
65
- >>> from diffusers import StableDiffusionXLPipeline
66
-
67
- >>> pipe = StableDiffusionXLPipeline.from_pretrained(
68
- diffusers. "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
69
- diffusers. )
70
- >>> pipe = pipe.to("cuda")
71
-
72
- >>> prompt = "a photo of an astronaut riding a horse on mars"
73
- >>> image = pipe(prompt).images[0]
74
- ```
75
- """
76
-
77
- def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
78
- """
79
- Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
80
- Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
81
- """
82
- std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
83
- std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
84
- # rescale the results from guidance (fixes overexposure)
85
- noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
86
- # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
87
- noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
88
- return noise_cfg
89
-
90
-
91
- def retrieve_timesteps(
92
- scheduler,
93
- num_inference_steps: Optional[int] = None,
94
- device: Optional[Union[str, torch.device]] = None,
95
- timesteps: Optional[List[int]] = None,
96
- sigmas: Optional[List[float]] = None,
97
- **kwargs,
98
- ):
99
- """
100
- Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
101
- custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
102
-
103
- Args:
104
- scheduler (`SchedulerMixin`):
105
- The scheduler to get timesteps from.
106
- num_inference_steps (`int`):
107
- The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
108
- must be `None`.
109
- device (`str` or `torch.device`, *optional*):
110
- The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
111
- timesteps (`List[int]`, *optional*):
112
- Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
113
- `num_inference_steps` and `sigmas` must be `None`.
114
- sigmas (`List[float]`, *optional*):
115
- Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
116
- `num_inference_steps` and `timesteps` must be `None`.
117
-
118
- Returns:
119
- `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
120
- second element is the number of inference steps.
121
- """
122
- if timesteps is not None and sigmas is not None:
123
- raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
124
- if timesteps is not None:
125
- accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
126
- if not accepts_timesteps:
127
- raise ValueError(
128
- f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
129
- f" timestep schedules. Please check whether you are using the correct scheduler."
130
- )
131
- scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
132
- timesteps = scheduler.timesteps
133
- num_inference_steps = len(timesteps)
134
- elif sigmas is not None:
135
- accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
136
- if not accept_sigmas:
137
- raise ValueError(
138
- f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
139
- f" sigmas schedules. Please check whether you are using the correct scheduler."
140
- )
141
- scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
142
- timesteps = scheduler.timesteps
143
- num_inference_steps = len(timesteps)
144
- else:
145
- scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
146
- timesteps = scheduler.timesteps
147
- return timesteps, num_inference_steps
148
-
149
-
150
- class StableDiffusionXLPipeline(
151
- DiffusionPipeline,
152
- StableDiffusionMixin,
153
- FromSingleFileMixin,
154
- StableDiffusionXLLoraLoaderMixin,
155
- TextualInversionLoaderMixin,
156
- IPAdapterMixin,
157
- ):
158
- r"""
159
- Pipeline for text-to-image generation using Stable Diffusion XL.
160
-
161
- This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
162
- library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
163
-
164
- The pipeline also inherits the following loading methods:
165
- - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
166
- - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
167
- - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
168
- - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
169
- - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
170
-
171
- Args:
172
- vae ([`AutoencoderKL`]):
173
- Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
174
- text_encoder ([`CLIPTextModel`]):
175
- Frozen text-encoder. Stable Diffusion XL uses the text portion of
176
- [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
177
- the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
178
- text_encoder_2 ([` CLIPTextModelWithProjection`]):
179
- Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
180
- [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
181
- specifically the
182
- [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
183
- variant.
184
- tokenizer (`CLIPTokenizer`):
185
- Tokenizer of class
186
- [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
187
- tokenizer_2 (`CLIPTokenizer`):
188
- Second Tokenizer of class
189
- [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
190
- unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
191
- scheduler ([`SchedulerMixin`]):
192
- A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
193
- [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
194
- force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
195
- Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
196
- `stabilityai/stable-diffusion-xl-base-1-0`.
197
- add_watermarker (`bool`, *optional*):
198
- Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
199
- watermark output images. If not defined, it will default to True if the package is installed, otherwise no
200
- watermarker will be used.
201
- """
202
-
203
- model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae"
204
- _optional_components = [
205
- "tokenizer",
206
- "tokenizer_2",
207
- "text_encoder",
208
- "text_encoder_2",
209
- "image_encoder",
210
- "feature_extractor",
211
- ]
212
- _callback_tensor_inputs = [
213
- "latents",
214
- "prompt_embeds",
215
- "negative_prompt_embeds",
216
- "add_text_embeds",
217
- "add_time_ids",
218
- "negative_pooled_prompt_embeds",
219
- "negative_add_time_ids",
220
- ]
221
-
222
- def __init__(
223
- self,
224
- vae: AutoencoderKL,
225
- text_encoder: CLIPTextModel,
226
- text_encoder_2: CLIPTextModelWithProjection,
227
- tokenizer: CLIPTokenizer,
228
- tokenizer_2: CLIPTokenizer,
229
- unet: UNet2DConditionModel,
230
- scheduler: KarrasDiffusionSchedulers,
231
- image_encoder: CLIPVisionModelWithProjection = None,
232
- feature_extractor: CLIPImageProcessor = None,
233
- force_zeros_for_empty_prompt: bool = True,
234
- add_watermarker: Optional[bool] = None,
235
- ):
236
- super().__init__()
237
-
238
- self.register_modules(
239
- vae=vae,
240
- text_encoder=text_encoder,
241
- text_encoder_2=text_encoder_2,
242
- tokenizer=tokenizer,
243
- tokenizer_2=tokenizer_2,
244
- unet=unet,
245
- scheduler=scheduler,
246
- image_encoder=image_encoder,
247
- feature_extractor=feature_extractor,
248
- )
249
- self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
250
- self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
251
- self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
252
-
253
- self.default_sample_size = self.unet.config.sample_size
254
-
255
- add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
256
-
257
- if add_watermarker:
258
- self.watermark = StableDiffusionXLWatermarker()
259
- else:
260
- self.watermark = None
261
-
262
- def encode_prompt(
263
- self,
264
- prompt: str,
265
- prompt_2: Optional[str] = None,
266
- device: Optional[torch.device] = None,
267
- num_images_per_prompt: int = 1,
268
- do_classifier_free_guidance: bool = True,
269
- negative_prompt: Optional[str] = None,
270
- negative_prompt_2: Optional[str] = None,
271
- prompt_embeds: Optional[torch.Tensor] = None,
272
- negative_prompt_embeds: Optional[torch.Tensor] = None,
273
- pooled_prompt_embeds: Optional[torch.Tensor] = None,
274
- negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
275
- lora_scale: Optional[float] = None,
276
- clip_skip: Optional[int] = None,
277
- ):
278
- r"""
279
- Encodes the prompt into text encoder hidden states.
280
-
281
- Args:
282
- prompt (`str` or `List[str]`, *optional*):
283
- prompt to be encoded
284
- prompt_2 (`str` or `List[str]`, *optional*):
285
- The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
286
- used in both text-encoders
287
- device: (`torch.device`):
288
- torch device
289
- num_images_per_prompt (`int`):
290
- number of images that should be generated per prompt
291
- do_classifier_free_guidance (`bool`):
292
- whether to use classifier free guidance or not
293
- negative_prompt (`str` or `List[str]`, *optional*):
294
- The prompt or prompts not to guide the image generation. If not defined, one has to pass
295
- `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
296
- less than `1`).
297
- negative_prompt_2 (`str` or `List[str]`, *optional*):
298
- The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
299
- `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
300
- prompt_embeds (`torch.Tensor`, *optional*):
301
- Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
302
- provided, text embeddings will be generated from `prompt` input argument.
303
- negative_prompt_embeds (`torch.Tensor`, *optional*):
304
- Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
305
- weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
306
- argument.
307
- pooled_prompt_embeds (`torch.Tensor`, *optional*):
308
- Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
309
- If not provided, pooled text embeddings will be generated from `prompt` input argument.
310
- negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
311
- Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
312
- weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
313
- input argument.
314
- lora_scale (`float`, *optional*):
315
- A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
316
- clip_skip (`int`, *optional*):
317
- Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
318
- the output of the pre-final layer will be used for computing the prompt embeddings.
319
- """
320
- device = device or self._execution_device
321
-
322
- if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
323
- self._lora_scale = lora_scale
324
-
325
- # dynamically adjust the LoRA scale
326
- if self.text_encoder is not None:
327
- if not USE_PEFT_BACKEND:
328
- adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
329
- else:
330
- scale_lora_layers(self.text_encoder, lora_scale)
331
-
332
- if self.text_encoder_2 is not None:
333
- if not USE_PEFT_BACKEND:
334
- adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
335
- else:
336
- scale_lora_layers(self.text_encoder_2, lora_scale)
337
-
338
- prompt = [prompt] if isinstance(prompt, str) else prompt
339
-
340
- if prompt is not None:
341
- batch_size = len(prompt)
342
- else:
343
- batch_size = prompt_embeds.shape[0]
344
-
345
- tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
346
- text_encoders = (
347
- [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
348
- )
349
-
350
- if prompt_embeds is None:
351
- prompt_2 = prompt_2 or prompt
352
- prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
353
-
354
- prompt_embeds_list = []
355
- prompts = [prompt, prompt_2]
356
- for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
357
- if isinstance(self, TextualInversionLoaderMixin):
358
- prompt = self.maybe_convert_prompt(prompt, tokenizer)
359
-
360
- text_inputs = tokenizer(
361
- prompt,
362
- padding="max_length",
363
- max_length=tokenizer.model_max_length,
364
- truncation=True,
365
- return_tensors="pt",
366
- )
367
-
368
- text_input_ids = text_inputs.input_ids
369
- untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
370
-
371
- if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
372
- text_input_ids, untruncated_ids
373
- ):
374
- removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
375
- logger.warning(
376
- "The following part of your input was truncated because CLIP can only handle sequences up to"
377
- f" {tokenizer.model_max_length} tokens: {removed_text}"
378
- )
379
-
380
- prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
381
-
382
- pooled_prompt_embeds = prompt_embeds[0]
383
- if clip_skip is None:
384
- prompt_embeds = prompt_embeds.hidden_states[-2]
385
- else:
386
- prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
387
-
388
- prompt_embeds_list.append(prompt_embeds)
389
-
390
- prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
391
-
392
- zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
393
- if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
394
- negative_prompt_embeds = torch.zeros_like(prompt_embeds)
395
- negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
396
- elif do_classifier_free_guidance and negative_prompt_embeds is None:
397
- negative_prompt = negative_prompt or ""
398
- negative_prompt_2 = negative_prompt_2 or negative_prompt
399
-
400
- negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
401
- negative_prompt_2 = (
402
- batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
403
- )
404
-
405
- uncond_tokens: List[str]
406
- if prompt is not None and type(prompt) is not type(negative_prompt):
407
- raise TypeError(
408
- f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
409
- f" {type(prompt)}."
410
- )
411
- elif batch_size != len(negative_prompt):
412
- raise ValueError(
413
- f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
414
- f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
415
- " the batch size of `prompt`."
416
- )
417
- else:
418
- uncond_tokens = [negative_prompt, negative_prompt_2]
419
-
420
- negative_prompt_embeds_list = []
421
- for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
422
- if isinstance(self, TextualInversionLoaderMixin):
423
- negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
424
-
425
- max_length = prompt_embeds.shape[1]
426
- uncond_input = tokenizer(
427
- negative_prompt,
428
- padding="max_length",
429
- max_length=max_length,
430
- truncation=True,
431
- return_tensors="pt",
432
- )
433
-
434
- negative_prompt_embeds = text_encoder(
435
- uncond_input.input_ids.to(device),
436
- output_hidden_states=True,
437
- )
438
- negative_pooled_prompt_embeds = negative_prompt_embeds[0]
439
- negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
440
-
441
- negative_prompt_embeds_list.append(negative_prompt_embeds)
442
-
443
- negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
444
-
445
- if self.text_encoder_2 is not None:
446
- prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
447
- else:
448
- prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
449
-
450
- bs_embed, seq_len, _ = prompt_embeds.shape
451
- prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
452
- prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
453
-
454
- if do_classifier_free_guidance:
455
- seq_len = negative_prompt_embeds.shape[1]
456
-
457
- if self.text_encoder_2 is not None:
458
- negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
459
- else:
460
- negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
461
-
462
- negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
463
- negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
464
-
465
- pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
466
- bs_embed * num_images_per_prompt, -1
467
- )
468
- if do_classifier_free_guidance:
469
- negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
470
- bs_embed * num_images_per_prompt, -1
471
- )
472
-
473
- if self.text_encoder is not None:
474
- if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
475
- unscale_lora_layers(self.text_encoder, lora_scale)
476
-
477
- if self.text_encoder_2 is not None:
478
- if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
479
- unscale_lora_layers(self.text_encoder_2, lora_scale)
480
-
481
- return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
482
-
483
- def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
484
- dtype = next(self.image_encoder.parameters()).dtype
485
-
486
- if not isinstance(image, torch.Tensor):
487
- image = self.feature_extractor(image, return_tensors="pt").pixel_values
488
-
489
- image = image.to(device=device, dtype=dtype)
490
- if output_hidden_states:
491
- image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
492
- image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
493
- uncond_image_enc_hidden_states = self.image_encoder(
494
- torch.zeros_like(image), output_hidden_states=True
495
- ).hidden_states[-2]
496
- uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
497
- num_images_per_prompt, dim=0
498
- )
499
- return image_enc_hidden_states, uncond_image_enc_hidden_states
500
- else:
501
- image_embeds = self.image_encoder(image).image_embeds
502
- image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
503
- uncond_image_embeds = torch.zeros_like(image_embeds)
504
-
505
- return image_embeds, uncond_image_embeds
506
-
507
- def prepare_ip_adapter_image_embeds(
508
- self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
509
- ):
510
- image_embeds = []
511
- if do_classifier_free_guidance:
512
- negative_image_embeds = []
513
- if ip_adapter_image_embeds is None:
514
- if not isinstance(ip_adapter_image, list):
515
- ip_adapter_image = [ip_adapter_image]
516
-
517
- if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
518
- raise ValueError(
519
- f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
520
- )
521
-
522
- for single_ip_adapter_image, image_proj_layer in zip(
523
- ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
524
- ):
525
- output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
526
- single_image_embeds, single_negative_image_embeds = self.encode_image(
527
- single_ip_adapter_image, device, 1, output_hidden_state
528
- )
529
-
530
- image_embeds.append(single_image_embeds[None, :])
531
- if do_classifier_free_guidance:
532
- negative_image_embeds.append(single_negative_image_embeds[None, :])
533
- else:
534
- for single_image_embeds in ip_adapter_image_embeds:
535
- if do_classifier_free_guidance:
536
- single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
537
- negative_image_embeds.append(single_negative_image_embeds)
538
- image_embeds.append(single_image_embeds)
539
-
540
- ip_adapter_image_embeds = []
541
- for i, single_image_embeds in enumerate(image_embeds):
542
- single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
543
- if do_classifier_free_guidance:
544
- single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
545
- single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
546
-
547
- single_image_embeds = single_image_embeds.to(device=device)
548
- ip_adapter_image_embeds.append(single_image_embeds)
549
-
550
- return ip_adapter_image_embeds
551
-
552
- def prepare_extra_step_kwargs(self, generator, eta):
553
-
554
- accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
555
- extra_step_kwargs = {}
556
- if accepts_eta:
557
- extra_step_kwargs["eta"] = eta
558
-
559
- # check if the scheduler accepts generator
560
- accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
561
- if accepts_generator:
562
- extra_step_kwargs["generator"] = generator
563
- return extra_step_kwargs
564
-
565
- def check_inputs(
566
- self,
567
- prompt,
568
- prompt_2,
569
- height,
570
- width,
571
- callback_steps,
572
- negative_prompt=None,
573
- negative_prompt_2=None,
574
- prompt_embeds=None,
575
- negative_prompt_embeds=None,
576
- pooled_prompt_embeds=None,
577
- negative_pooled_prompt_embeds=None,
578
- ip_adapter_image=None,
579
- ip_adapter_image_embeds=None,
580
- callback_on_step_end_tensor_inputs=None,
581
- ):
582
- if height % 8 != 0 or width % 8 != 0:
583
- raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
584
-
585
- if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
586
- raise ValueError(
587
- f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
588
- f" {type(callback_steps)}."
589
- )
590
-
591
- if callback_on_step_end_tensor_inputs is not None and not all(
592
- k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
593
- ):
594
- raise ValueError(
595
- f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
596
- )
597
-
598
- if prompt is not None and prompt_embeds is not None:
599
- raise ValueError(
600
- f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
601
- " only forward one of the two."
602
- )
603
- elif prompt_2 is not None and prompt_embeds is not None:
604
- raise ValueError(
605
- f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
606
- " only forward one of the two."
607
- )
608
- elif prompt is None and prompt_embeds is None:
609
- raise ValueError(
610
- "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
611
- )
612
- elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
613
- raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
614
- elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
615
- raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
616
-
617
- if negative_prompt is not None and negative_prompt_embeds is not None:
618
- raise ValueError(
619
- f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
620
- f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
621
- )
622
- elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
623
- raise ValueError(
624
- f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
625
- f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
626
- )
627
-
628
- if prompt_embeds is not None and negative_prompt_embeds is not None:
629
- if prompt_embeds.shape != negative_prompt_embeds.shape:
630
- raise ValueError(
631
- "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
632
- f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
633
- f" {negative_prompt_embeds.shape}."
634
- )
635
-
636
- if prompt_embeds is not None and pooled_prompt_embeds is None:
637
- raise ValueError(
638
- "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
639
- )
640
-
641
- if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
642
- raise ValueError(
643
- "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
644
- )
645
-
646
- if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
647
- raise ValueError(
648
- "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
649
- )
650
-
651
- if ip_adapter_image_embeds is not None:
652
- if not isinstance(ip_adapter_image_embeds, list):
653
- raise ValueError(
654
- f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
655
- )
656
- elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
657
- raise ValueError(
658
- f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
659
- )
660
-
661
- def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
662
- shape = (
663
- batch_size,
664
- num_channels_latents,
665
- int(height) // self.vae_scale_factor,
666
- int(width) // self.vae_scale_factor,
667
- )
668
- if isinstance(generator, list) and len(generator) != batch_size:
669
- raise ValueError(
670
- f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
671
- f" size of {batch_size}. Make sure the batch size matches the length of the generators."
672
- )
673
-
674
- if latents is None:
675
- latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
676
- else:
677
- latents = latents.to(device)
678
-
679
- # scale the initial noise by the standard deviation required by the scheduler
680
- latents = latents * self.scheduler.init_noise_sigma
681
- return latents
682
-
683
- def _get_add_time_ids(
684
- self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
685
- ):
686
- add_time_ids = list(original_size + crops_coords_top_left + target_size)
687
-
688
- passed_add_embed_dim = (
689
- self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
690
- )
691
- expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
692
-
693
- if expected_add_embed_dim != passed_add_embed_dim:
694
- raise ValueError(
695
- f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
696
- )
697
-
698
- add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
699
- return add_time_ids
700
-
701
- def upcast_vae(self):
702
- dtype = self.vae.dtype
703
- self.vae.to(dtype=torch.float32)
704
- use_torch_2_0_or_xformers = isinstance(
705
- self.vae.decoder.mid_block.attentions[0].processor,
706
- (
707
- AttnProcessor2_0,
708
- XFormersAttnProcessor,
709
- FusedAttnProcessor2_0,
710
- ),
711
- )
712
- if use_torch_2_0_or_xformers:
713
- self.vae.post_quant_conv.to(dtype)
714
- self.vae.decoder.conv_in.to(dtype)
715
- self.vae.decoder.mid_block.to(dtype)
716
-
717
- def get_guidance_scale_embedding(
718
- self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
719
- ) -> torch.Tensor:
720
- """
721
- See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
722
-
723
- Args:
724
- w (`torch.Tensor`):
725
- Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
726
- embedding_dim (`int`, *optional*, defaults to 512):
727
- Dimension of the embeddings to generate.
728
- dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
729
- Data type of the generated embeddings.
730
-
731
- Returns:
732
- `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
733
- """
734
- assert len(w.shape) == 1
735
- w = w * 1000.0
736
-
737
- half_dim = embedding_dim // 2
738
- emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
739
- emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
740
- emb = w.to(dtype)[:, None] * emb[None, :]
741
- emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
742
- if embedding_dim % 2 == 1: # zero pad
743
- emb = torch.nn.functional.pad(emb, (0, 1))
744
- assert emb.shape == (w.shape[0], embedding_dim)
745
- return emb
746
-
747
- @property
748
- def guidance_scale(self):
749
- return self._guidance_scale
750
-
751
- @property
752
- def guidance_rescale(self):
753
- return self._guidance_rescale
754
-
755
- @property
756
- def clip_skip(self):
757
- return self._clip_skip
758
-
759
- @property
760
- def do_classifier_free_guidance(self):
761
- return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
762
-
763
- @property
764
- def cross_attention_kwargs(self):
765
- return self._cross_attention_kwargs
766
-
767
- @property
768
- def denoising_end(self):
769
- return self._denoising_end
770
-
771
- @property
772
- def num_timesteps(self):
773
- return self._num_timesteps
774
-
775
- @property
776
- def interrupt(self):
777
- return self._interrupt
778
-
779
- @torch.no_grad()
780
- @replace_example_docstring(EXAMPLE_DOC_STRING)
781
- def __call__(
782
- self,
783
- prompt: Union[str, List[str]] = None,
784
- prompt_2: Optional[Union[str, List[str]]] = None,
785
- height: Optional[int] = None,
786
- width: Optional[int] = None,
787
- num_inference_steps: int = 50,
788
- timesteps: List[int] = None,
789
- sigmas: List[float] = None,
790
- denoising_end: Optional[float] = None,
791
- guidance_scale: float = 5.0,
792
- negative_prompt: Optional[Union[str, List[str]]] = None,
793
- negative_prompt_2: Optional[Union[str, List[str]]] = None,
794
- num_images_per_prompt: Optional[int] = 1,
795
- eta: float = 0.0,
796
- generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
797
- latents: Optional[torch.Tensor] = None,
798
- prompt_embeds: Optional[torch.Tensor] = None,
799
- negative_prompt_embeds: Optional[torch.Tensor] = None,
800
- pooled_prompt_embeds: Optional[torch.Tensor] = None,
801
- negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
802
- ip_adapter_image: Optional[PipelineImageInput] = None,
803
- ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
804
- output_type: Optional[str] = "pil",
805
- return_dict: bool = True,
806
- cross_attention_kwargs: Optional[Dict[str, Any]] = None,
807
- guidance_rescale: float = 0.0,
808
- end_cfg: float = 0.73,
809
- original_size: Optional[Tuple[int, int]] = None,
810
- crops_coords_top_left: Tuple[int, int] = (0, 0),
811
- target_size: Optional[Tuple[int, int]] = None,
812
- negative_original_size: Optional[Tuple[int, int]] = None,
813
- negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
814
- negative_target_size: Optional[Tuple[int, int]] = None,
815
- clip_skip: Optional[int] = None,
816
- callback_on_step_end: Optional[
817
- Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
818
- ] = None,
819
- callback_on_step_end_tensor_inputs: List[str] = ["latents"],
820
- **kwargs,
821
- ):
822
- r"""
823
- Function invoked when calling the pipeline for generation.
824
-
825
- Args:
826
- prompt (`str` or `List[str]`, *optional*):
827
- The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
828
- instead.
829
- prompt_2 (`str` or `List[str]`, *optional*):
830
- The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
831
- used in both text-encoders
832
- height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
833
- The height in pixels of the generated image. This is set to 1024 by default for the best results.
834
- Anything below 512 pixels won't work well for
835
- [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
836
- and checkpoints that are not specifically fine-tuned on low resolutions.
837
- width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
838
- The width in pixels of the generated image. This is set to 1024 by default for the best results.
839
- Anything below 512 pixels won't work well for
840
- [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
841
- and checkpoints that are not specifically fine-tuned on low resolutions.
842
- num_inference_steps (`int`, *optional*, defaults to 50):
843
- The number of denoising steps. More denoising steps usually lead to a higher quality image at the
844
- expense of slower inference.
845
- timesteps (`List[int]`, *optional*):
846
- Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
847
- in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
848
- passed will be used. Must be in descending order.
849
- sigmas (`List[float]`, *optional*):
850
- Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
851
- their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
852
- will be used.
853
- denoising_end (`float`, *optional*):
854
- When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
855
- completed before it is intentionally prematurely terminated. As a result, the returned sample will
856
- still retain a substantial amount of noise as determined by the discrete timesteps selected by the
857
- scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
858
- "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
859
- Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
860
- guidance_scale (`float`, *optional*, defaults to 5.0):
861
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
862
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
863
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
864
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
865
- usually at the expense of lower image quality.
866
- negative_prompt (`str` or `List[str]`, *optional*):
867
- The prompt or prompts not to guide the image generation. If not defined, one has to pass
868
- `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
869
- less than `1`).
870
- negative_prompt_2 (`str` or `List[str]`, *optional*):
871
- The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
872
- `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
873
- num_images_per_prompt (`int`, *optional*, defaults to 1):
874
- The number of images to generate per prompt.
875
- eta (`float`, *optional*, defaults to 0.0):
876
- Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
877
- [`schedulers.DDIMScheduler`], will be ignored for others.
878
- generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
879
- One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
880
- to make generation deterministic.
881
- latents (`torch.Tensor`, *optional*):
882
- Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
883
- generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
884
- tensor will be generated by sampling using the supplied random `generator`.
885
- prompt_embeds (`torch.Tensor`, *optional*):
886
- Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
887
- provided, text embeddings will be generated from `prompt` input argument.
888
- negative_prompt_embeds (`torch.Tensor`, *optional*):
889
- Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
890
- weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
891
- argument.
892
- pooled_prompt_embeds (`torch.Tensor`, *optional*):
893
- Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
894
- If not provided, pooled text embeddings will be generated from `prompt` input argument.
895
- negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
896
- Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
897
- weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
898
- input argument.
899
- ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
900
- ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
901
- Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
902
- IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
903
- contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
904
- provided, embeddings are computed from the `ip_adapter_image` input argument.
905
- output_type (`str`, *optional*, defaults to `"pil"`):
906
- The output format of the generate image. Choose between
907
- [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
908
- return_dict (`bool`, *optional*, defaults to `True`):
909
- Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
910
- of a plain tuple.
911
- cross_attention_kwargs (`dict`, *optional*):
912
- A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
913
- `self.processor` in
914
- [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
915
- guidance_rescale (`float`, *optional*, defaults to 0.0):
916
- Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
917
- Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
918
- [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
919
- Guidance rescale factor should fix overexposure when using zero terminal SNR.
920
- original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
921
- If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
922
- `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
923
- explained in section 2.2 of
924
- [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
925
- crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
926
- `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
927
- `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
928
- `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
929
- [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
930
- target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
931
- For most cases, `target_size` should be set to the desired height and width of the generated image. If
932
- not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
933
- section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
934
- negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
935
- To negatively condition the generation process based on a specific image resolution. Part of SDXL's
936
- micro-conditioning as explained in section 2.2 of
937
- [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
938
- information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
939
- negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
940
- To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
941
- micro-conditioning as explained in section 2.2 of
942
- [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
943
- information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
944
- negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
945
- To negatively condition the generation process based on a target image resolution. It should be as same
946
- as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
947
- [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
948
- information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
949
- callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
950
- A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
951
- each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
952
- DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
953
- list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
954
- callback_on_step_end_tensor_inputs (`List`, *optional*):
955
- The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
956
- will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
957
- `._callback_tensor_inputs` attribute of your pipeline class.
958
-
959
- Examples:
960
-
961
- Returns:
962
- [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
963
- [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
964
- `tuple`. When returning a tuple, the first element is a list with the generated images.
965
- """
966
-
967
- callback = kwargs.pop("callback", None)
968
- callback_steps = kwargs.pop("callback_steps", None)
969
-
970
- if callback is not None:
971
- deprecate(
972
- "callback",
973
- "1.0.0",
974
- "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
975
- )
976
- if callback_steps is not None:
977
- deprecate(
978
- "callback_steps",
979
- "1.0.0",
980
- "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
981
- )
982
-
983
- if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
984
- callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
985
-
986
- height = height or self.default_sample_size * self.vae_scale_factor
987
- width = width or self.default_sample_size * self.vae_scale_factor
988
-
989
- original_size = original_size or (height, width)
990
- target_size = target_size or (height, width)
991
-
992
- self.check_inputs(
993
- prompt,
994
- prompt_2,
995
- height,
996
- width,
997
- callback_steps,
998
- negative_prompt,
999
- negative_prompt_2,
1000
- prompt_embeds,
1001
- negative_prompt_embeds,
1002
- pooled_prompt_embeds,
1003
- negative_pooled_prompt_embeds,
1004
- ip_adapter_image,
1005
- ip_adapter_image_embeds,
1006
- callback_on_step_end_tensor_inputs,
1007
- )
1008
-
1009
- self._guidance_scale = guidance_scale
1010
- self._guidance_rescale = guidance_rescale
1011
- self._clip_skip = clip_skip
1012
- self._cross_attention_kwargs = cross_attention_kwargs
1013
- self._denoising_end = denoising_end
1014
- self._interrupt = False
1015
-
1016
- if prompt is not None and isinstance(prompt, str):
1017
- batch_size = 1
1018
- elif prompt is not None and isinstance(prompt, list):
1019
- batch_size = len(prompt)
1020
- else:
1021
- batch_size = prompt_embeds.shape[0]
1022
-
1023
- device = self._execution_device
1024
-
1025
- lora_scale = (
1026
- self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
1027
- )
1028
-
1029
- (
1030
- prompt_embeds,
1031
- negative_prompt_embeds,
1032
- pooled_prompt_embeds,
1033
- negative_pooled_prompt_embeds,
1034
- ) = self.encode_prompt(
1035
- prompt=prompt,
1036
- prompt_2=prompt_2,
1037
- device=device,
1038
- num_images_per_prompt=num_images_per_prompt,
1039
- do_classifier_free_guidance=self.do_classifier_free_guidance,
1040
- negative_prompt=negative_prompt,
1041
- negative_prompt_2=negative_prompt_2,
1042
- prompt_embeds=prompt_embeds,
1043
- negative_prompt_embeds=negative_prompt_embeds,
1044
- pooled_prompt_embeds=pooled_prompt_embeds,
1045
- negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
1046
- lora_scale=lora_scale,
1047
- clip_skip=self.clip_skip,
1048
- )
1049
-
1050
- timesteps, num_inference_steps = retrieve_timesteps(
1051
- self.scheduler, num_inference_steps, device, timesteps, sigmas
1052
- )
1053
-
1054
- # 5. Prepare latent variables
1055
- num_channels_latents = self.unet.config.in_channels
1056
- latents = self.prepare_latents(
1057
- batch_size * num_images_per_prompt,
1058
- num_channels_latents,
1059
- height,
1060
- width,
1061
- prompt_embeds.dtype,
1062
- device,
1063
- generator,
1064
- latents,
1065
- )
1066
-
1067
- extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
1068
-
1069
- add_text_embeds = pooled_prompt_embeds
1070
- if self.text_encoder_2 is None:
1071
- text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
1072
- else:
1073
- text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
1074
-
1075
- add_time_ids = self._get_add_time_ids(
1076
- original_size,
1077
- crops_coords_top_left,
1078
- target_size,
1079
- dtype=prompt_embeds.dtype,
1080
- text_encoder_projection_dim=text_encoder_projection_dim,
1081
- )
1082
- if negative_original_size is not None and negative_target_size is not None:
1083
- negative_add_time_ids = self._get_add_time_ids(
1084
- negative_original_size,
1085
- negative_crops_coords_top_left,
1086
- negative_target_size,
1087
- dtype=prompt_embeds.dtype,
1088
- text_encoder_projection_dim=text_encoder_projection_dim,
1089
- )
1090
- else:
1091
- negative_add_time_ids = add_time_ids
1092
-
1093
- if self.do_classifier_free_guidance:
1094
- prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
1095
- add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
1096
- add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
1097
-
1098
- prompt_embeds = prompt_embeds.to(device)
1099
- add_text_embeds = add_text_embeds.to(device)
1100
- add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
1101
-
1102
- if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
1103
- image_embeds = self.prepare_ip_adapter_image_embeds(
1104
- ip_adapter_image,
1105
- ip_adapter_image_embeds,
1106
- device,
1107
- batch_size * num_images_per_prompt,
1108
- self.do_classifier_free_guidance,
1109
- )
1110
-
1111
- num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
1112
-
1113
- if (
1114
- self.denoising_end is not None
1115
- and isinstance(self.denoising_end, float)
1116
- and self.denoising_end > 0
1117
- and self.denoising_end < 1
1118
- ):
1119
- discrete_timestep_cutoff = int(
1120
- round(
1121
- self.scheduler.config.num_train_timesteps
1122
- - (self.denoising_end * self.scheduler.config.num_train_timesteps)
1123
- )
1124
- )
1125
- num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
1126
- timesteps = timesteps[:num_inference_steps]
1127
-
1128
- timestep_cond = None
1129
- if self.unet.config.time_cond_proj_dim is not None:
1130
- guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
1131
- timestep_cond = self.get_guidance_scale_embedding(
1132
- guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
1133
- ).to(device=device, dtype=latents.dtype)
1134
-
1135
- self._num_timesteps = len(timesteps)
1136
- with self.progress_bar(total=num_inference_steps) as progress_bar:
1137
- do_classifier_free_guidance = self.do_classifier_free_guidance
1138
- for i, t in enumerate(timesteps):
1139
- if self.interrupt:
1140
- continue
1141
- if end_cfg is not None and i / num_inference_steps > end_cfg and do_classifier_free_guidance:
1142
- do_classifier_free_guidance = False
1143
- prompt_embeds = torch.chunk(prompt_embeds, 2, dim=0)[-1]
1144
- add_text_embeds = torch.chunk(add_text_embeds, 2, dim=0)[-1]
1145
- add_time_ids = torch.chunk(add_time_ids, 2, dim=0)[-1]
1146
- latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
1147
-
1148
- latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
1149
-
1150
- added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
1151
- if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
1152
- added_cond_kwargs["image_embeds"] = image_embeds
1153
- noise_pred = self.unet(
1154
- latent_model_input,
1155
- t,
1156
- encoder_hidden_states=prompt_embeds,
1157
- timestep_cond=timestep_cond,
1158
- cross_attention_kwargs=self.cross_attention_kwargs,
1159
- added_cond_kwargs=added_cond_kwargs,
1160
- return_dict=False,
1161
- )[0]
1162
-
1163
- if do_classifier_free_guidance:
1164
- noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
1165
- noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
1166
-
1167
- if do_classifier_free_guidance and self.guidance_rescale > 0.0:
1168
- noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
1169
-
1170
- latents_dtype = latents.dtype
1171
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
1172
- if latents.dtype != latents_dtype:
1173
- if torch.backends.mps.is_available():
1174
- latents = latents.to(latents_dtype)
1175
-
1176
- if callback_on_step_end is not None:
1177
- callback_kwargs = {}
1178
- for k in callback_on_step_end_tensor_inputs:
1179
- callback_kwargs[k] = locals()[k]
1180
- callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1181
-
1182
- latents = callback_outputs.pop("latents", latents)
1183
- prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1184
- negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
1185
- add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
1186
- negative_pooled_prompt_embeds = callback_outputs.pop(
1187
- "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
1188
- )
1189
- add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
1190
- negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids)
1191
-
1192
- if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1193
- progress_bar.update()
1194
- if callback is not None and i % callback_steps == 0:
1195
- step_idx = i // getattr(self.scheduler, "order", 1)
1196
- callback(step_idx, t, latents)
1197
-
1198
- if XLA_AVAILABLE:
1199
- xm.mark_step()
1200
-
1201
- if not output_type == "latent":
1202
- needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
1203
-
1204
- if needs_upcasting:
1205
- self.upcast_vae()
1206
- latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
1207
- elif latents.dtype != self.vae.dtype:
1208
- if torch.backends.mps.is_available():
1209
- self.vae = self.vae.to(latents.dtype)
1210
-
1211
- has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
1212
- has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
1213
- if has_latents_mean and has_latents_std:
1214
- latents_mean = (
1215
- torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
1216
- )
1217
- latents_std = (
1218
- torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
1219
- )
1220
- latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
1221
- else:
1222
- latents = latents / self.vae.config.scaling_factor
1223
-
1224
- image = self.vae.decode(latents, return_dict=False)[0]
1225
-
1226
- if needs_upcasting:
1227
- self.vae.to(dtype=torch.float16)
1228
- else:
1229
- image = latents
1230
-
1231
- if not output_type == "latent":
1232
- if self.watermark is not None:
1233
- image = self.watermark.apply_watermark(image)
1234
-
1235
- image = self.image_processor.postprocess(image, output_type=output_type)
1236
-
1237
- self.maybe_free_model_hooks()
1238
-
1239
- if not return_dict:
1240
- return (image,)
1241
-
1242
- return StableDiffusionXLPipelineOutput(images=image)
1243
-
1244
  #from onediff.schedulers import EulerDiscreteScheduler
1245
  from onediffx import compile_pipe
1246
 
@@ -1256,15 +22,12 @@ def load_pipeline(pipeline=None) -> StableDiffusionXLPipeline:
1256
  # pipeline.scheduler.config,)
1257
 
1258
  pipeline.to("cuda")
 
1259
  pipeline = compile_pipe(pipeline)
1260
  for _ in range(4):
1261
- pipeline(prompt="kamala harris flying to the moon", num_inference_steps=20, end_cfg=0.73)
1262
-
1263
-
1264
-
1265
  return pipeline
1266
 
1267
-
1268
  def infer(request: TextToImageRequest, pipeline: StableDiffusionXLPipeline) -> Image:
1269
  if request.seed is None:
1270
  generator = None
@@ -1277,6 +40,12 @@ def infer(request: TextToImageRequest, pipeline: StableDiffusionXLPipeline) -> I
1277
  width=request.width,
1278
  height=request.height,
1279
  generator=generator,
1280
- end_cfg=0.73,
1281
  num_inference_steps=20,
1282
  ).images[0]
 
1
  import torch
2
+ #import xformers
3
+ #import triton
4
  from PIL.Image import Image
5
+ from onediffx.deep_cache import StableDiffusionXLPipeline
6
+ #from diffusers import StableDiffusionXLPipeline
7
  from pipelines.models import TextToImageRequest
8
  from torch import Generator
9
 
10
  #from onediff.schedulers import EulerDiscreteScheduler
11
  from onediffx import compile_pipe
12
 
 
22
  # pipeline.scheduler.config,)
23
 
24
  pipeline.to("cuda")
25
+
26
  pipeline = compile_pipe(pipeline)
27
  for _ in range(4):
28
+ deepcache_output = pipeline(prompt="kamala harris defends my submission", output_type="pil", cache_interval=1, cache_layer_id=1, cache_block_id=0)
 
 
 
29
  return pipeline
30
 
 
31
  def infer(request: TextToImageRequest, pipeline: StableDiffusionXLPipeline) -> Image:
32
  if request.seed is None:
33
  generator = None
 
40
  width=request.width,
41
  height=request.height,
42
  generator=generator,
43
+ end_cfg=0.5,
44
  num_inference_steps=20,
45
+ cache_interval=1,
46
+ cache_layer_id=1,
47
+ cache_block_id=0,
48
+ eta=1,
49
+ guidance_scale = 5.0,
50
+ guidance_rescale = 0.0,
51
  ).images[0]
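
For reference, a minimal sketch of how the updated module might be exercised locally after this change. It assumes `pipelines.models.TextToImageRequest` accepts `prompt`, `width`, `height`, and `seed` keyword arguments (the fields referenced by `infer`) and that the module is importable as `pipeline` from `src/pipeline.py`; both are assumptions, not part of this commit.

```py
# Hypothetical smoke test for the updated src/pipeline.py (not part of the commit).
# Assumes TextToImageRequest exposes prompt/width/height/seed, as implied by infer().
from pipelines.models import TextToImageRequest

from pipeline import infer, load_pipeline  # src/pipeline.py in this repo (assumed import path)

pipe = load_pipeline()  # loads SDXL, moves it to CUDA, compiles it with onediffx, and runs warm-up calls

request = TextToImageRequest(
    prompt="a photo of an astronaut riding a horse on mars",
    width=1024,
    height=1024,
    seed=42,
)

image = infer(request, pipe)  # returns a PIL.Image.Image
image.save("output.png")
```

Note on the new arguments: `cache_interval`, `cache_layer_id`, and `cache_block_id` follow the DeepCache interface of `onediffx.deep_cache`, while `end_cfg` was a parameter of the removed vendored pipeline (it stopped classifier-free guidance once the given fraction of the denoising steps had passed); whether the onediffx pipeline accepts `end_cfg` is not shown in this diff.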