jimmycarter
/

LibreFLUX

@@ -1611,18 +1611,33 @@ class CustomPipeline(DiffusionPipeline, SD3LoraLoaderMixin):
                 prompt_mask_input = prompt_mask
                 latent_model_input = latents
                 if do_batch_cfg and guidance_scale_real > 1.0 and i >= no_cfg_until_timestep:
                     # Concatenate prompt embeddings
                     prompt_embeds_input = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
                     pooled_prompt_embeds_input = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
-                    # # Concatenate text IDs if they are used
-                    if text_ids is not None and negative_text_ids is not None:
-                        text_ids_input = torch.cat([negative_text_ids, text_ids], dim=0)
                     # Concatenate latent image IDs if they are used
-                    if latent_image_ids is not None:
-                        latent_image_ids_input = torch.cat([latent_image_ids, latent_image_ids], dim=0)
                     # Concatenate prompt masks if they are used
                     if prompt_mask is not None and negative_mask is not None:
@@ -1643,37 +1658,22 @@ class CustomPipeline(DiffusionPipeline, SD3LoraLoaderMixin):
                 # Prepare extra transformer arguments
                 extra_transformer_args = {}
                 if prompt_mask is not None:
-                    extra_transformer_args["attention_mask"] = prompt_mask_input.to(device=self.transformer.device)
                 # Forward pass through the transformer
                 noise_pred = self.transformer(
-                    hidden_states=latent_model_input.to(device=self.transformer.device),
                     timestep=timestep / 1000,
                     guidance=guidance,
-                    pooled_projections=pooled_prompt_embeds_input.to(device=self.transformer.device),
-                    encoder_hidden_states=prompt_embeds_input.to(device=self.transformer.device),
-                    txt_ids=text_ids_input.to(device=self.transformer.device) if text_ids is not None else None,
-                    img_ids=latent_image_ids_input.to(device=self.transformer.device) if latent_image_ids is not None else None,
                     joint_attention_kwargs=self.joint_attention_kwargs,
                     return_dict=False,
                     **extra_transformer_args,
                 )[0]
-                if do_batch_cfg and guidance_scale_real > 1.0 and i >= no_cfg_until_timestep:
-                    progress_bar.set_postfix(
-                        {
-                            'ts': timestep.detach().item() / 1000,
-                            'cfg': self._guidance_scale_real,
-                        },
-                    )
-                else:
-                    progress_bar.set_postfix(
-                        {
-                            'ts': timestep.detach().item() / 1000,
-                            'cfg': 'N/A',
-                        },
-                    )
                 # Apply real CFG
                 if guidance_scale_real > 1.0 and i >= no_cfg_until_timestep:
                     if do_batch_cfg:

                 prompt_mask_input = prompt_mask
                 latent_model_input = latents
+                if guidance_scale_real > 1.0 and i >= no_cfg_until_timestep:
+                    progress_bar.set_postfix(
+                        {
+                            'ts': timestep.detach().item() / 1000,
+                            'cfg': self._guidance_scale_real,
+                        },
+                    )
+                else:
+                    progress_bar.set_postfix(
+                        {
+                            'ts': timestep.detach().item() / 1000,
+                            'cfg': 'N/A',
+                        },
+                    )
                 if do_batch_cfg and guidance_scale_real > 1.0 and i >= no_cfg_until_timestep:
                     # Concatenate prompt embeddings
                     prompt_embeds_input = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
                     pooled_prompt_embeds_input = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
+                    # Concatenate text IDs if they are used
+                    # if text_ids is not None and negative_text_ids is not None:
+                    #     text_ids_input = torch.cat([negative_text_ids, text_ids], dim=0)
                     # Concatenate latent image IDs if they are used
+                    # if latent_image_ids is not None:
+                    #     latent_image_ids_input = torch.cat([latent_image_ids, latent_image_ids], dim=0)
                     # Concatenate prompt masks if they are used
                     if prompt_mask is not None and negative_mask is not None:
                 # Prepare extra transformer arguments
                 extra_transformer_args = {}
                 if prompt_mask is not None:
+                    extra_transformer_args["attention_mask"] = prompt_mask_input.to(device=self.transformer.device).contiguous()
                 # Forward pass through the transformer
                 noise_pred = self.transformer(
+                    hidden_states=latent_model_input.to(device=self.transformer.device).contiguous() ,
                     timestep=timestep / 1000,
                     guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds_input.to(device=self.transformer.device).contiguous() ,
+                    encoder_hidden_states=prompt_embeds_input.to(device=self.transformer.device).contiguous() ,
+                    txt_ids=text_ids_input.to(device=self.transformer.device).contiguous() if text_ids is not None else None,
+                    img_ids=latent_image_ids_input.to(device=self.transformer.device).contiguous() if latent_image_ids is not None else None,
                     joint_attention_kwargs=self.joint_attention_kwargs,
                     return_dict=False,
                     **extra_transformer_args,
                 )[0]
                 # Apply real CFG
                 if guidance_scale_real > 1.0 and i >= no_cfg_until_timestep:
                     if do_batch_cfg: