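"""Scale-wise denoising (SwD) pipeline for Stable Diffusion 3.

Overrides ``StableDiffusion3Pipeline.__call__`` with a caller-supplied schedule:
each step predicts the clean latents, upsamples them to the next latent scale,
and renoises them with the next sigma.
"""
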
import torch
from typing import Any, Callable, Dict, List, Optional, Union

from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
from diffusers.models.autoencoders import AutoencoderKL
from diffusers.models.transformers import SD3Transformer2DModel
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
from transformers import (
    CLIPTextModelWithProjection,
    CLIPTokenizer,
    SiglipImageProcessor,
    SiglipVisionModel,
    T5EncoderModel,
    T5TokenizerFast,
)

from diffusers.utils import (
    USE_PEFT_BACKEND,
    is_torch_xla_available,
    logging,
    replace_example_docstring,
    scale_lora_layers,
    unscale_lora_layers,
)
from diffusers.pipelines.stable_diffusion_3.pipeline_output import StableDiffusion3PipelineOutput
from diffusers import StableDiffusion3Pipeline

if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False


class SwDPipeline(StableDiffusion3Pipeline):
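    """Scale-wise generation on top of Stable Diffusion 3.

    Each denoising step predicts the clean latents (x0) at the current
    resolution, upsamples them to the next entry in ``scales``, and renoises
    them with the next sigma. The schedule is supplied by the caller:
    ``timesteps`` has one entry per step, ``sigmas`` has one extra trailing
    entry, and ``scales[i + 1]`` is the latent spatial size used after step
    ``i``.
    """
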
    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        prompt_2: Optional[Union[str, List[str]]] = None,
        prompt_3: Optional[Union[str, List[str]]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 28,
        sigmas: Optional[List[float]] = None,
        timesteps: Optional[List[float]] = None,
        scales: Optional[List[int]] = None,
        guidance_scale: float = 7.0,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        negative_prompt_2: Optional[Union[str, List[str]]] = None,
        negative_prompt_3: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        ip_adapter_image: Optional[PipelineImageInput] = None,
        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
        clip_skip: Optional[int] = None,
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        max_sequence_length: int = 256,
        skip_guidance_layers: Optional[List[int]] = None,
        skip_layer_guidance_scale: float = 2.8,
        skip_layer_guidance_stop: float = 0.2,
        skip_layer_guidance_start: float = 0.01,
        mu: Optional[float] = None,
    ):
        height = height or self.default_sample_size * self.vae_scale_factor
        width = width or self.default_sample_size * self.vae_scale_factor

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt,
            prompt_2,
            prompt_3,
            height,
            width,
            negative_prompt=negative_prompt,
            negative_prompt_2=negative_prompt_2,
            negative_prompt_3=negative_prompt_3,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            pooled_prompt_embeds=pooled_prompt_embeds,
            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
            max_sequence_length=max_sequence_length,
        )

        self._guidance_scale = guidance_scale
        self._skip_layer_guidance_scale = skip_layer_guidance_scale
        self._clip_skip = clip_skip
        self._joint_attention_kwargs = joint_attention_kwargs
        self._interrupt = False

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device

        # 3. Encode input prompt
        lora_scale = (
            self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
        )
        (
            prompt_embeds,
            negative_prompt_embeds,
            pooled_prompt_embeds,
            negative_pooled_prompt_embeds,
        ) = self.encode_prompt(
            prompt=prompt,
            prompt_2=prompt_2,
            prompt_3=prompt_3,
            negative_prompt=negative_prompt,
            negative_prompt_2=negative_prompt_2,
            negative_prompt_3=negative_prompt_3,
            do_classifier_free_guidance=self.do_classifier_free_guidance,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            pooled_prompt_embeds=pooled_prompt_embeds,
            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
            device=device,
            clip_skip=self.clip_skip,
            num_images_per_prompt=num_images_per_prompt,
            max_sequence_length=max_sequence_length,
            lora_scale=lora_scale,
        )

        if self.do_classifier_free_guidance:
            if skip_guidance_layers is not None:
                # keep the positive-only embeddings around for the skip-layer pass
                original_prompt_embeds = prompt_embeds
                original_pooled_prompt_embeds = pooled_prompt_embeds
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
            pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)

        # 4. Prepare latent variables
        num_channels_latents = self.transformer.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        # 5. Prepare timesteps (caller-supplied scale-wise schedule: one timestep
        # per step, with len(sigmas) == len(timesteps) + 1)
        if timesteps is None or sigmas is None:
            raise ValueError("`timesteps` and `sigmas` must both be provided for the scale-wise schedule.")
        timesteps = torch.as_tensor(timesteps, device=device)  # tensor so `t.expand(...)` works below
        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
        self._num_timesteps = len(timesteps)

        # 6. Prepare image embeddings
        if (ip_adapter_image is not None and self.is_ip_adapter_active) or ip_adapter_image_embeds is not None:
            ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
                ip_adapter_image,
                ip_adapter_image_embeds,
                device,
                batch_size * num_images_per_prompt,
                self.do_classifier_free_guidance,
            )

            if self.joint_attention_kwargs is None:
                self._joint_attention_kwargs = {"ip_adapter_image_embeds": ip_adapter_image_embeds}
            else:
                self._joint_attention_kwargs.update(ip_adapter_image_embeds=ip_adapter_image_embeds)

        # 7. Denoising loop
        with self.progress_bar(total=len(timesteps)) as progress_bar:
            for i, t in enumerate(timesteps):
                if self.interrupt:
                    continue

                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                timestep = t.expand(latent_model_input.shape[0])

                noise_pred = self.transformer(
                    hidden_states=latent_model_input,
                    timestep=timestep,
                    encoder_hidden_states=prompt_embeds,
                    pooled_projections=pooled_prompt_embeds,
                    joint_attention_kwargs=self.joint_attention_kwargs,
                    return_dict=False,
                )[0]

                # perform guidance
                if self.do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
                    should_skip_layers = (
                        num_inference_steps * skip_layer_guidance_start
                        < i
                        < num_inference_steps * skip_layer_guidance_stop
                    )
                    if skip_guidance_layers is not None and should_skip_layers:
                        timestep = t.expand(latents.shape[0])
                        latent_model_input = latents
                        noise_pred_skip_layers = self.transformer(
                            hidden_states=latent_model_input,
                            timestep=timestep,
                            encoder_hidden_states=original_prompt_embeds,
                            pooled_projections=original_pooled_prompt_embeds,
                            joint_attention_kwargs=self.joint_attention_kwargs,
                            return_dict=False,
                            skip_layers=skip_guidance_layers,
                        )[0]
                        noise_pred = (
                            noise_pred
                            + (noise_pred_text - noise_pred_skip_layers) * self._skip_layer_guidance_scale
                        )

                # scale-wise update: under flow matching, x_t = (1 - sigma) * x0 + sigma * noise
                # and the model predicts v = noise - x0, so x0 = x_t - sigma * v exactly
                latents_dtype = latents.dtype
                sigma = sigmas[i]
                sigma_next = sigmas[i + 1]
                x0_pred = latents - sigma * noise_pred
                # upsample the x0 prediction to the next latent scale, then renoise it
                if scales and i + 1 < len(scales):
                    x0_pred = torch.nn.functional.interpolate(x0_pred, size=scales[i + 1], mode="bicubic")
                noise = torch.randn(x0_pred.shape, generator=generator, dtype=x0_pred.dtype).to(x0_pred.device)
                latents = (1 - sigma_next) * x0_pred + sigma_next * noise

                if latents.dtype != latents_dtype:
                    if torch.backends.mps.is_available():
                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
                        latents = latents.to(latents_dtype)

                if callback_on_step_end is not None:
                    callback_kwargs = {}
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
                    negative_pooled_prompt_embeds = callback_outputs.pop(
                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
                    )

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()

                if XLA_AVAILABLE:
                    xm.mark_step()

        if output_type == "latent":
            image = latents
        else:
            # undo the VAE latent scaling/shift before decoding
            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
            image = self.vae.decode(latents, return_dict=False)[0]
            image = self.image_processor.postprocess(image, output_type=output_type)

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return (image,)

        return StableDiffusion3PipelineOutput(images=image)
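

if __name__ == "__main__":
    # Minimal usage sketch. The checkpoint id and the schedule below are illustrative
    # placeholders, not values shipped with this file; substitute the timesteps/sigmas/
    # scales published with your SwD-distilled checkpoint.
    pipe = SwDPipeline.from_pretrained(
        "stabilityai/stable-diffusion-3.5-medium",  # placeholder: use an SwD-distilled SD3 checkpoint
        torch_dtype=torch.bfloat16,
    ).to("cuda")
    image = pipe(
        prompt="a photo of a red panda in a bamboo forest",
        num_inference_steps=4,
        timesteps=[1000.0, 750.0, 500.0, 250.0],  # one entry per step
        sigmas=[1.0, 0.75, 0.5, 0.25, 0.0],       # len(timesteps) + 1 entries
        scales=[64, 80, 96, 128],                 # latent sizes; scales[i + 1] follows step i
        height=512,                               # initial scale: 512 px -> 64-px latents
        width=512,                                # final image is scales[-1] * 8 = 1024 px
        guidance_scale=0.0,                       # few-step distilled models typically skip CFG
    ).images[0]
    image.save("swd_sample.png")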