Text-to-Image
Diffusers
Safetensors
LibreFluxIPAdapterPipeline
neuralvfx committed on
Commit
f9471c4
·
verified ·
1 Parent(s): 4478860

Update pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +27 -52
pipeline.py CHANGED
@@ -965,32 +965,19 @@ class LibreFluxIpAdapterPipeline(DiffusionPipeline, SD3LoraLoaderMixin):
965
  div_timestep = (timestep / 1000.0)
966
  text_ids = [ t for t in text_ids ]
967
 
968
- if not self.ip_loaded:
969
- noise_pred = self.transformer(
970
- latent_model_input,
971
- timestep=div_timestep.to(device=self.transformer.device),
972
- guidance=guidance,
973
- pooled_projections=pooled_prompt_embeds_input.to(device=self.transformer.device),
974
- encoder_hidden_states=prompt_embeds_input.to(device=self.transformer.device),
975
- attention_mask=prompt_mask_input.to(device=self.transformer.device),
976
- txt_ids=text_ids_input[0],
977
- img_ids=latent_image_ids_input[0].to(device=self.transformer.device),
978
- return_dict=False,
979
- )[0]
980
- else:
981
- noise_pred = self.ip_adapter(
982
- image_embeds_input,
983
- latent_model_input.to(device=self.transformer.device),
984
- layer_scale=layer_scale_input,
985
- timestep=div_timestep.to(device=self.transformer.device),
986
- guidance=guidance,
987
- pooled_projections=pooled_prompt_embeds_input.to(device=self.transformer.device),
988
- encoder_hidden_states=prompt_embeds_input.to(device=self.transformer.device),
989
- attention_mask=prompt_mask_input.to(device=self.transformer.device),
990
- txt_ids=text_ids_input[0],
991
- img_ids=latent_image_ids_input[0].to(device=self.transformer.device),
992
- return_dict=False,
993
- )[0]
994
 
995
  # Apply real CFG
996
  if guidance_scale_real > 1.0 and i >= no_cfg_until_timestep:
@@ -1000,32 +987,20 @@ class LibreFluxIpAdapterPipeline(DiffusionPipeline, SD3LoraLoaderMixin):
1000
  noise_pred = noise_pred_uncond + guidance_scale_real * (noise_pred_cond - noise_pred_uncond)
1001
  else:
1002
  # Sequential CFG: Compute unconditional noise prediction separately
1003
- if not self.ip_loaded:
1004
- noise_pred_uncond = self.transformer(
1005
- latents.to(device=self.transformer.device),
1006
- timestep=div_timestep,
1007
- guidance=guidance,
1008
- pooled_projections=negative_pooled_prompt_embeds.to(device=self.transformer.device),
1009
- encoder_hidden_states=negative_prompt_embeds.to(device=self.transformer.device),
1010
- attention_mask=negative_mask,
1011
- txt_ids=negative_text_ids.to(device=self.transformer.device) if negative_text_ids is not None else None,
1012
- img_ids=latent_image_ids[0].to(device=self.transformer.device),
1013
- return_dict=False,
1014
- )[0]
1015
- else:
1016
- noise_pred_uncond = self.ip_adapter(
1017
- image_embeds,
1018
- latents.to(device=self.transformer.device),
1019
- layer_scale=neg_layer_scale,
1020
- timestep=div_timestep,
1021
- guidance=guidance,
1022
- pooled_projections=negative_pooled_prompt_embeds.to(device=self.transformer.device),
1023
- encoder_hidden_states=negative_prompt_embeds.to(device=self.transformer.device),
1024
- attention_mask=negative_mask,
1025
- txt_ids=negative_text_ids.to(device=self.transformer.device) if negative_text_ids is not None else None,
1026
- img_ids=latent_image_ids[0].to(device=self.transformer.device),
1027
- return_dict=False,
1028
- )[0]
1029
 
1030
  # Combine conditional and unconditional predictions
1031
  noise_pred = noise_pred_uncond + guidance_scale_real * (noise_pred - noise_pred_uncond)
 
965
  div_timestep = (timestep / 1000.0)
966
  text_ids = [ t for t in text_ids ]
967
 
968
+ noise_pred = self.ip_adapter(
969
+ image_embeds_input,
970
+ latent_model_input.to(device=self.transformer.device),
971
+ layer_scale=layer_scale_input,
972
+ timestep=div_timestep.to(device=self.transformer.device),
973
+ guidance=guidance,
974
+ pooled_projections=pooled_prompt_embeds_input.to(device=self.transformer.device),
975
+ encoder_hidden_states=prompt_embeds_input.to(device=self.transformer.device),
976
+ attention_mask=prompt_mask_input.to(device=self.transformer.device),
977
+ txt_ids=text_ids_input[0],
978
+ img_ids=latent_image_ids_input[0].to(device=self.transformer.device),
979
+ return_dict=False,
980
+ )[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
981
 
982
  # Apply real CFG
983
  if guidance_scale_real > 1.0 and i >= no_cfg_until_timestep:
 
987
  noise_pred = noise_pred_uncond + guidance_scale_real * (noise_pred_cond - noise_pred_uncond)
988
  else:
989
  # Sequential CFG: Compute unconditional noise prediction separately
990
+
991
+ noise_pred_uncond = self.ip_adapter(
992
+ image_embeds,
993
+ latents.to(device=self.transformer.device),
994
+ layer_scale=neg_layer_scale,
995
+ timestep=div_timestep,
996
+ guidance=guidance,
997
+ pooled_projections=negative_pooled_prompt_embeds.to(device=self.transformer.device),
998
+ encoder_hidden_states=negative_prompt_embeds.to(device=self.transformer.device),
999
+ attention_mask=negative_mask,
1000
+ txt_ids=negative_text_ids.to(device=self.transformer.device) if negative_text_ids is not None else None,
1001
+ img_ids=latent_image_ids[0].to(device=self.transformer.device),
1002
+ return_dict=False,
1003
+ )[0]
 
 
 
 
 
 
 
 
 
 
 
 
1004
 
1005
  # Combine conditional and unconditional predictions
1006
  noise_pred = noise_pred_uncond + guidance_scale_real * (noise_pred - noise_pred_uncond)