thisiswooyeol committed on
Commit
5d03d1e
·
verified ·
1 Parent(s): 5feb9d1

Upload pipeline_stable_diffusion_migc.py

Browse files
Files changed (1) hide show
  1. pipeline_stable_diffusion_migc.py +110 -72
pipeline_stable_diffusion_migc.py CHANGED
@@ -22,11 +22,9 @@ from diffusers.schedulers import KarrasDiffusionSchedulers
22
  from diffusers.utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
23
  from diffusers.utils.torch_utils import randn_tensor
24
  from packaging import version
25
- from scipy.ndimage import uniform_filter
26
  from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
27
 
28
- # from utils import load_utils
29
- from core.diffusion.migc.mich_arch import MIGC, NaiveFuser
30
 
31
  logger = logging.get_logger(__name__)
32
 
@@ -51,7 +49,8 @@ class MIGCProcessor(AttnProcessor):
51
  hidden_states: torch.Tensor,
52
  encoder_hidden_states: torch.Tensor | None = None,
53
  attention_mask: torch.Tensor | None = None,
54
- encoder_hidden_states_phrases=None,
 
55
  bboxes: List[List[float]] = [],
56
  ith: int = 0,
57
  embeds_pooler: torch.Tensor | None = None,
@@ -62,14 +61,39 @@ class MIGCProcessor(AttnProcessor):
62
  ca_scale: float | None = None,
63
  ea_scale: float | None = None,
64
  sac_scale: float | None = None,
 
 
 
65
  ):
66
  batch_size, sequence_length, _ = hidden_states.shape
67
  assert batch_size == 1 or batch_size == 2, (
68
  "We currently only implement sampling with batch_size=1, and we will implement sampling with batch_size=N as soon as possible."
69
  )
 
 
 
70
 
71
- attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  instance_num = len(bboxes)
74
 
75
  if ith > MIGCsteps:
@@ -80,43 +104,67 @@ class MIGCProcessor(AttnProcessor):
80
 
81
  is_cross = encoder_hidden_states is not None
82
 
83
- # ori_hidden_states = hidden_states.clone()
84
-
85
  # In this case, we need to use MIGC or naive_fuser, so
86
  # 1. We concat prompt embeds and phrases embeds
87
  # 2. we copy the hidden_states_cond (instance_num+1) times for QKV
88
  if is_cross and not is_vanilla_cross:
 
89
  encoder_hidden_states = torch.cat([encoder_hidden_states, encoder_hidden_states_phrases])
90
  # print(encoder_hidden_states.shape)
91
  hidden_states_uncond = hidden_states[[0], ...]
92
  hidden_states_cond = hidden_states[[1], ...].repeat(instance_num + 1, 1, 1)
93
  hidden_states = torch.cat([hidden_states_uncond, hidden_states_cond])
 
 
 
94
 
95
- # QKV Operation of Vanilla Self-Attention or Cross-Attention
96
  query = attn.to_q(hidden_states)
97
 
98
  if encoder_hidden_states is None:
99
  encoder_hidden_states = hidden_states
 
 
100
 
101
  key = attn.to_k(encoder_hidden_states)
102
  value = attn.to_v(encoder_hidden_states)
103
 
104
- query = attn.head_to_batch_dim(query)
105
- key = attn.head_to_batch_dim(key)
106
- value = attn.head_to_batch_dim(value)
 
 
 
 
 
 
 
 
 
107
 
108
  hidden_states = F.scaled_dot_product_attention(
109
  query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
110
  )
111
- # attention_probs = attn.get_attention_scores(query, key, attention_mask) # 48 4096 77
112
- # hidden_states = torch.bmm(attention_probs, value)
113
- hidden_states = attn.batch_to_head_dim(hidden_states)
 
 
114
 
115
  # linear proj
116
  hidden_states = attn.to_out[0](hidden_states)
117
  # dropout
118
  hidden_states = attn.to_out[1](hidden_states)
119
 
 
 
 
 
 
 
 
 
 
 
120
  ###### Self-Attention Results ######
121
  if not is_cross:
122
  return hidden_states
@@ -129,42 +177,12 @@ class MIGCProcessor(AttnProcessor):
129
  # hidden_states: torch.Size([1+1+instance_num, HW, C]), the first 1 is the uncond ca output, the second 1 is the global ca output.
130
  hidden_states_uncond = hidden_states[[0], ...] # torch.Size([1, HW, C])
131
  cond_ca_output = hidden_states[1:, ...].unsqueeze(0) # torch.Size([1, 1+instance_num, 5, 64, 1280])
132
- guidance_masks = []
133
- in_box = []
134
- # Construct Instance Guidance Mask
135
- for bbox in bboxes:
136
- guidance_mask = np.zeros((height, width))
137
- w_min = int(width * bbox[0])
138
- w_max = int(width * bbox[2])
139
- h_min = int(height * bbox[1])
140
- h_max = int(height * bbox[3])
141
- guidance_mask[h_min:h_max, w_min:w_max] = 1.0
142
- guidance_masks.append(guidance_mask[None, ...])
143
- in_box.append([bbox[0], bbox[2], bbox[1], bbox[3]])
144
-
145
- # Construct Background Guidance Mask
146
- sup_mask = get_sup_mask(guidance_masks)
147
- supplement_mask = torch.from_numpy(sup_mask[None, ...])
148
- supplement_mask = F.interpolate(supplement_mask, (height // 8, width // 8), mode="bilinear").float()
149
- supplement_mask = supplement_mask.to(hidden_states.device) # (1, 1, H, W)
150
-
151
- guidance_masks = np.concatenate(guidance_masks, axis=0)
152
- guidance_masks = guidance_masks[None, ...]
153
- guidance_masks = torch.from_numpy(guidance_masks).float().to(cond_ca_output.device)
154
- guidance_masks = F.interpolate(
155
- guidance_masks, (height // 8, width // 8), mode="bilinear"
156
- ) # (1, instance_num, H, W)
157
-
158
- in_box = torch.from_numpy(np.array(in_box))[None, ...].float().to(cond_ca_output.device) # (1, instance_num, 4)
159
 
160
  other_info = {}
161
  other_info["image_token"] = hidden_states_cond[None, ...]
162
- other_info["context"] = encoder_hidden_states[1:, ...]
163
  other_info["box"] = in_box
164
  other_info["context_pooler"] = embeds_pooler[:, None, :] # (instance_num, 1, 768)
165
  other_info["supplement_mask"] = supplement_mask
166
- other_info["attn2"] = None
167
- other_info["attn"] = attn
168
  other_info["height"] = height
169
  other_info["width"] = width
170
  other_info["ca_scale"] = ca_scale
@@ -326,7 +344,7 @@ class StableDiffusionMIGCPipeline(
326
  scheduler: KarrasDiffusionSchedulers,
327
  safety_checker: StableDiffusionSafetyChecker,
328
  feature_extractor: CLIPImageProcessor,
329
- image_encoder: CLIPVisionModelWithProjection = None,
330
  requires_safety_checker: bool = True,
331
  ):
332
  super().__init__()
@@ -419,7 +437,11 @@ class StableDiffusionMIGCPipeline(
419
  self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
420
  self.register_to_config(requires_safety_checker=requires_safety_checker)
421
 
422
- # self.embedding = {}
 
 
 
 
423
 
424
  def _register_migc_adapters(self, unet: UNet2DConditionModel):
425
  for name, module in unet.named_modules():
@@ -448,7 +470,7 @@ class StableDiffusionMIGCPipeline(
448
  device,
449
  num_images_per_prompt,
450
  do_classifier_free_guidance,
451
- negative_prompt=None,
452
  prompt_embeds: Optional[torch.Tensor] = None,
453
  negative_prompt_embeds: Optional[torch.Tensor] = None,
454
  pooled_prompt_embeds: Optional[torch.Tensor] = None,
@@ -881,24 +903,19 @@ class StableDiffusionMIGCPipeline(
881
  callback_on_step_end_tensor_inputs: List[str] = ["latents"],
882
  MIGCsteps=20,
883
  NaiveFuserSteps=-1,
884
- ca_scale=None,
885
- ea_scale=None,
886
- sac_scale=None,
887
- aug_phase_with_and=False,
888
- sa_preserve=False,
889
- use_sa_preserve=False,
890
  **kwargs,
891
  ):
892
  r"""
893
  The call function to the pipeline for generation.
894
 
895
  Args:
896
- prompt (`str` or `List[str]`, *optional*):
897
  The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
898
  instead.
899
- token_indices (Union[List[List[List[int]]], List[List[int]]], optional):
900
- The list of the indexes in the prompt to layout. Defaults to None.
901
- bboxes (Union[List[List[List[float]]], List[List[float]]], optional):
902
  The bounding boxes of the indexes to maintain layout in the image. Defaults to None.
903
  height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
904
  The height in pixels of the generated image.
@@ -1037,7 +1054,7 @@ class StableDiffusionMIGCPipeline(
1037
  else:
1038
  batch_size = prompt_embeds.shape[0]
1039
  if batch_size > 1:
1040
- raise NotImplementedError("Batch processing is not supported.")
1041
 
1042
  device = self._execution_device
1043
 
@@ -1067,6 +1084,7 @@ class StableDiffusionMIGCPipeline(
1067
  # Here we concatenate the unconditional and text embeddings into a single batch
1068
  # to avoid doing two forward passes
1069
  if self.do_classifier_free_guidance:
 
1070
  prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
1071
 
1072
  # 4. Prepare timesteps
@@ -1087,6 +1105,37 @@ class StableDiffusionMIGCPipeline(
1087
  latents,
1088
  )
1089
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1090
  # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
1091
  extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
1092
 
@@ -1098,20 +1147,6 @@ class StableDiffusionMIGCPipeline(
1098
  guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
1099
  ).to(device=device, dtype=latents.dtype)
1100
 
1101
- # 6.2 prepare MIGC guidance_mask
1102
- guidance_mask = np.full((4, height // 8, width // 8), 1.0)
1103
-
1104
- for bbox in bboxes:
1105
- w_min = max(0, int(width * bbox[0] // 8) - 5)
1106
- w_max = min(width, int(width * bbox[2] // 8) + 5)
1107
- h_min = max(0, int(height * bbox[1] // 8) - 5)
1108
- h_max = min(height, int(height * bbox[3] // 8) + 5)
1109
- guidance_mask[:, h_min:h_max, w_min:w_max] = 0
1110
-
1111
- kernal_size = 5
1112
- guidance_mask = uniform_filter(guidance_mask, axes=(1, 2), size=kernal_size)
1113
- guidance_mask = torch.from_numpy(guidance_mask).to(self.device).unsqueeze(0)
1114
-
1115
  # 7. Denoising loop
1116
  num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
1117
  self._num_timesteps = len(timesteps)
@@ -1139,6 +1174,9 @@ class StableDiffusionMIGCPipeline(
1139
  "ca_scale": ca_scale,
1140
  "ea_scale": ea_scale,
1141
  "sac_scale": sac_scale,
 
 
 
1142
  }
1143
 
1144
  noise_pred = self.unet(
 
22
  from diffusers.utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
23
  from diffusers.utils.torch_utils import randn_tensor
24
  from packaging import version
 
25
  from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
26
 
27
+ from core.diffusion.migc.migc_archs import MIGC, NaiveFuser
 
28
 
29
  logger = logging.get_logger(__name__)
30
 
 
49
  hidden_states: torch.Tensor,
50
  encoder_hidden_states: torch.Tensor | None = None,
51
  attention_mask: torch.Tensor | None = None,
52
+ temb: torch.Tensor | None = None,
53
+ encoder_hidden_states_phrases: torch.Tensor | None = None,
54
  bboxes: List[List[float]] = [],
55
  ith: int = 0,
56
  embeds_pooler: torch.Tensor | None = None,
 
61
  ca_scale: float | None = None,
62
  ea_scale: float | None = None,
63
  sac_scale: float | None = None,
64
+ guidance_masks: torch.Tensor | None = None,
65
+ supplement_mask: torch.Tensor | None = None,
66
+ in_box: torch.Tensor | None = None,
67
  ):
68
  batch_size, sequence_length, _ = hidden_states.shape
69
  assert batch_size == 1 or batch_size == 2, (
70
  "We currently only implement sampling with batch_size=1, and we will implement sampling with batch_size=N as soon as possible."
71
  )
72
+ residual = hidden_states
73
+ if attn.spatial_norm is not None:
74
+ hidden_states = attn.spatial_norm(hidden_states, temb)
75
 
76
+ input_ndim = hidden_states.ndim
77
 
78
+ if input_ndim == 4:
79
+ batch_size, channel, height, width = hidden_states.shape
80
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
81
+
82
+ batch_size, sequence_length, _ = (
83
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
84
+ )
85
+
86
+ if attention_mask is not None:
87
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
88
+ # scaled_dot_product_attention expects attention_mask shape to be
89
+ # (batch, heads, source_length, target_length)
90
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
91
+
92
+ if attn.group_norm is not None:
93
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
94
+
95
+ ##########
96
+ # Expand encoder_hidden_states with encoder_hidden_states_phrases
97
  instance_num = len(bboxes)
98
 
99
  if ith > MIGCsteps:
 
104
 
105
  is_cross = encoder_hidden_states is not None
106
 
 
 
107
  # In this case, we need to use MIGC or naive_fuser, so
108
  # 1. We concat prompt embeds and phrases embeds
109
  # 2. we copy the hidden_states_cond (instance_num+1) times for QKV
110
  if is_cross and not is_vanilla_cross:
111
+ batch_size_phrases = encoder_hidden_states_phrases.shape[0]
112
  encoder_hidden_states = torch.cat([encoder_hidden_states, encoder_hidden_states_phrases])
113
  # print(encoder_hidden_states.shape)
114
  hidden_states_uncond = hidden_states[[0], ...]
115
  hidden_states_cond = hidden_states[[1], ...].repeat(instance_num + 1, 1, 1)
116
  hidden_states = torch.cat([hidden_states_uncond, hidden_states_cond])
117
+ else:
118
+ batch_size_phrases = 0
119
+ ##########
120
 
 
121
  query = attn.to_q(hidden_states)
122
 
123
  if encoder_hidden_states is None:
124
  encoder_hidden_states = hidden_states
125
+ elif attn.norm_cross:
126
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
127
 
128
  key = attn.to_k(encoder_hidden_states)
129
  value = attn.to_v(encoder_hidden_states)
130
 
131
+ inner_dim = key.shape[-1]
132
+ head_dim = inner_dim // attn.heads
133
+
134
+ query = query.view(batch_size + batch_size_phrases, -1, attn.heads, head_dim).transpose(1, 2)
135
+
136
+ key = key.view(batch_size + batch_size_phrases, -1, attn.heads, head_dim).transpose(1, 2)
137
+ value = value.view(batch_size + batch_size_phrases, -1, attn.heads, head_dim).transpose(1, 2)
138
+
139
+ if attn.norm_q is not None:
140
+ query = attn.norm_q(query)
141
+ if attn.norm_k is not None:
142
+ key = attn.norm_k(key)
143
 
144
  hidden_states = F.scaled_dot_product_attention(
145
  query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
146
  )
147
+
148
+ hidden_states = hidden_states.transpose(1, 2).reshape(
149
+ batch_size + batch_size_phrases, -1, attn.heads * head_dim
150
+ )
151
+ hidden_states = hidden_states.to(query.dtype)
152
 
153
  # linear proj
154
  hidden_states = attn.to_out[0](hidden_states)
155
  # dropout
156
  hidden_states = attn.to_out[1](hidden_states)
157
 
158
+ if input_ndim == 4:
159
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
160
+ batch_size + batch_size_phrases, channel, height, width
161
+ )
162
+
163
+ if attn.residual_connection:
164
+ hidden_states = hidden_states + residual
165
+
166
+ hidden_states = hidden_states / attn.rescale_output_factor
167
+
168
  ###### Self-Attention Results ######
169
  if not is_cross:
170
  return hidden_states
 
177
  # hidden_states: torch.Size([1+1+instance_num, HW, C]), the first 1 is the uncond ca output, the second 1 is the global ca output.
178
  hidden_states_uncond = hidden_states[[0], ...] # torch.Size([1, HW, C])
179
  cond_ca_output = hidden_states[1:, ...].unsqueeze(0) # torch.Size([1, 1+instance_num, 5, 64, 1280])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
 
181
  other_info = {}
182
  other_info["image_token"] = hidden_states_cond[None, ...]
 
183
  other_info["box"] = in_box
184
  other_info["context_pooler"] = embeds_pooler[:, None, :] # (instance_num, 1, 768)
185
  other_info["supplement_mask"] = supplement_mask
 
 
186
  other_info["height"] = height
187
  other_info["width"] = width
188
  other_info["ca_scale"] = ca_scale
 
344
  scheduler: KarrasDiffusionSchedulers,
345
  safety_checker: StableDiffusionSafetyChecker,
346
  feature_extractor: CLIPImageProcessor,
347
+ image_encoder: CLIPVisionModelWithProjection | None = None,
348
  requires_safety_checker: bool = True,
349
  ):
350
  super().__init__()
 
437
  self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
438
  self.register_to_config(requires_safety_checker=requires_safety_checker)
439
 
440
+ self.default_sample_size = (
441
+ self.unet.config.sample_size
442
+ if hasattr(self, "unet") and self.unet is not None and hasattr(self.unet.config, "sample_size")
443
+ else 64
444
+ )
445
 
446
  def _register_migc_adapters(self, unet: UNet2DConditionModel):
447
  for name, module in unet.named_modules():
 
470
  device,
471
  num_images_per_prompt,
472
  do_classifier_free_guidance,
473
+ negative_prompt: str | List[str] | None = None,
474
  prompt_embeds: Optional[torch.Tensor] = None,
475
  negative_prompt_embeds: Optional[torch.Tensor] = None,
476
  pooled_prompt_embeds: Optional[torch.Tensor] = None,
 
903
  callback_on_step_end_tensor_inputs: List[str] = ["latents"],
904
  MIGCsteps=20,
905
  NaiveFuserSteps=-1,
906
+ ca_scale: float | None = None,
907
+ ea_scale: float | None = None,
908
+ sac_scale: float | None = None,
 
 
 
909
  **kwargs,
910
  ):
911
  r"""
912
  The call function to the pipeline for generation.
913
 
914
  Args:
915
+ prompt (`str`, *optional*):
916
  The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
917
  instead.
918
+ bboxes (List[List[float]]], optional):
 
 
919
  The bounding boxes of the indexes to maintain layout in the image. Defaults to None.
920
  height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
921
  The height in pixels of the generated image.
 
1054
  else:
1055
  batch_size = prompt_embeds.shape[0]
1056
  if batch_size > 1:
1057
+ raise NotImplementedError("Batch processing is not supported yet.")
1058
 
1059
  device = self._execution_device
1060
 
 
1084
  # Here we concatenate the unconditional and text embeddings into a single batch
1085
  # to avoid doing two forward passes
1086
  if self.do_classifier_free_guidance:
1087
+ assert isinstance(negative_prompt_embeds, torch.Tensor)
1088
  prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
1089
 
1090
  # 4. Prepare timesteps
 
1105
  latents,
1106
  )
1107
 
1108
+ # 5.1 Prepare guidance masks
1109
+ guidance_masks = []
1110
+ in_box = []
1111
+ # Construct Instance Guidance Mask
1112
+ for bbox in bboxes:
1113
+ guidance_mask = np.zeros((height, width))
1114
+ w_min = int(width * bbox[0])
1115
+ w_max = int(width * bbox[2])
1116
+ h_min = int(height * bbox[1])
1117
+ h_max = int(height * bbox[3])
1118
+ guidance_mask[h_min:h_max, w_min:w_max] = 1.0
1119
+ guidance_masks.append(guidance_mask[None, ...])
1120
+ in_box.append([bbox[0], bbox[2], bbox[1], bbox[3]])
1121
+
1122
+ # Construct Background Guidance Mask
1123
+ sup_mask = get_sup_mask(guidance_masks)
1124
+ supplement_mask = torch.from_numpy(sup_mask[None, ...])
1125
+ supplement_mask = F.interpolate(supplement_mask, (height // 8, width // 8), mode="bilinear")
1126
+ supplement_mask = supplement_mask.to(device=device, dtype=self.unet.dtype) # (1, 1, H, W)
1127
+
1128
+ guidance_masks = np.concatenate(guidance_masks, axis=0)
1129
+ guidance_masks = guidance_masks[None, ...]
1130
+ guidance_masks = torch.from_numpy(guidance_masks).to(device=device, dtype=self.unet.dtype)
1131
+ guidance_masks = F.interpolate(
1132
+ guidance_masks, (height // 8, width // 8), mode="bilinear"
1133
+ ) # (1, instance_num, H, W)
1134
+
1135
+ in_box = torch.from_numpy(np.array(in_box))[None, ...].to(
1136
+ device=device, dtype=self.unet.dtype
1137
+ ) # (1, instance_num, 4)
1138
+
1139
  # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
1140
  extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
1141
 
 
1147
  guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
1148
  ).to(device=device, dtype=latents.dtype)
1149
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1150
  # 7. Denoising loop
1151
  num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
1152
  self._num_timesteps = len(timesteps)
 
1174
  "ca_scale": ca_scale,
1175
  "ea_scale": ea_scale,
1176
  "sac_scale": sac_scale,
1177
+ "guidance_masks": guidance_masks,
1178
+ "supplement_mask": supplement_mask,
1179
+ "in_box": in_box,
1180
  }
1181
 
1182
  noise_pred = self.unet(