Update pipeline.py
Browse files - pipeline.py (+27 -52)
pipeline.py
CHANGED
|
@@ -965,32 +965,19 @@ class LibreFluxIpAdapterPipeline(DiffusionPipeline, SD3LoraLoaderMixin):
|
|
| 965 |
div_timestep = (timestep / 1000.0)
|
| 966 |
text_ids = [ t for t in text_ids ]
|
| 967 |
|
| 968 |
-
|
| 969 |
-
|
| 970 |
-
|
| 971 |
-
|
| 972 |
-
|
| 973 |
-
|
| 974 |
-
|
| 975 |
-
|
| 976 |
-
|
| 977 |
-
|
| 978 |
-
|
| 979 |
-
|
| 980 |
-
|
| 981 |
-
noise_pred = self.ip_adapter(
|
| 982 |
-
image_embeds_input,
|
| 983 |
-
latent_model_input.to(device=self.transformer.device),
|
| 984 |
-
layer_scale=layer_scale_input,
|
| 985 |
-
timestep=div_timestep.to(device=self.transformer.device),
|
| 986 |
-
guidance=guidance,
|
| 987 |
-
pooled_projections=pooled_prompt_embeds_input.to(device=self.transformer.device),
|
| 988 |
-
encoder_hidden_states=prompt_embeds_input.to(device=self.transformer.device),
|
| 989 |
-
attention_mask=prompt_mask_input.to(device=self.transformer.device),
|
| 990 |
-
txt_ids=text_ids_input[0],
|
| 991 |
-
img_ids=latent_image_ids_input[0].to(device=self.transformer.device),
|
| 992 |
-
return_dict=False,
|
| 993 |
-
)[0]
|
| 994 |
|
| 995 |
# Apply real CFG
|
| 996 |
if guidance_scale_real > 1.0 and i >= no_cfg_until_timestep:
|
|
@@ -1000,32 +987,20 @@ class LibreFluxIpAdapterPipeline(DiffusionPipeline, SD3LoraLoaderMixin):
|
|
| 1000 |
noise_pred = noise_pred_uncond + guidance_scale_real * (noise_pred_cond - noise_pred_uncond)
|
| 1001 |
else:
|
| 1002 |
# Sequential CFG: Compute unconditional noise prediction separately
|
| 1003 |
-
|
| 1004 |
-
|
| 1005 |
-
|
| 1006 |
-
|
| 1007 |
-
|
| 1008 |
-
|
| 1009 |
-
|
| 1010 |
-
|
| 1011 |
-
|
| 1012 |
-
|
| 1013 |
-
|
| 1014 |
-
|
| 1015 |
-
|
| 1016 |
-
|
| 1017 |
-
image_embeds,
|
| 1018 |
-
latents.to(device=self.transformer.device),
|
| 1019 |
-
layer_scale=neg_layer_scale,
|
| 1020 |
-
timestep=div_timestep,
|
| 1021 |
-
guidance=guidance,
|
| 1022 |
-
pooled_projections=negative_pooled_prompt_embeds.to(device=self.transformer.device),
|
| 1023 |
-
encoder_hidden_states=negative_prompt_embeds.to(device=self.transformer.device),
|
| 1024 |
-
attention_mask=negative_mask,
|
| 1025 |
-
txt_ids=negative_text_ids.to(device=self.transformer.device) if negative_text_ids is not None else None,
|
| 1026 |
-
img_ids=latent_image_ids[0].to(device=self.transformer.device),
|
| 1027 |
-
return_dict=False,
|
| 1028 |
-
)[0]
|
| 1029 |
|
| 1030 |
# Combine conditional and unconditional predictions
|
| 1031 |
noise_pred = noise_pred_uncond + guidance_scale_real * (noise_pred - noise_pred_uncond)
|
|
|
|
| 965 |
div_timestep = (timestep / 1000.0)
|
| 966 |
text_ids = [ t for t in text_ids ]
|
| 967 |
|
| 968 |
+
noise_pred = self.ip_adapter(
|
| 969 |
+
image_embeds_input,
|
| 970 |
+
latent_model_input.to(device=self.transformer.device),
|
| 971 |
+
layer_scale=layer_scale_input,
|
| 972 |
+
timestep=div_timestep.to(device=self.transformer.device),
|
| 973 |
+
guidance=guidance,
|
| 974 |
+
pooled_projections=pooled_prompt_embeds_input.to(device=self.transformer.device),
|
| 975 |
+
encoder_hidden_states=prompt_embeds_input.to(device=self.transformer.device),
|
| 976 |
+
attention_mask=prompt_mask_input.to(device=self.transformer.device),
|
| 977 |
+
txt_ids=text_ids_input[0],
|
| 978 |
+
img_ids=latent_image_ids_input[0].to(device=self.transformer.device),
|
| 979 |
+
return_dict=False,
|
| 980 |
+
)[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 981 |
|
| 982 |
# Apply real CFG
|
| 983 |
if guidance_scale_real > 1.0 and i >= no_cfg_until_timestep:
|
|
|
|
| 987 |
noise_pred = noise_pred_uncond + guidance_scale_real * (noise_pred_cond - noise_pred_uncond)
|
| 988 |
else:
|
| 989 |
# Sequential CFG: Compute unconditional noise prediction separately
|
| 990 |
+
|
| 991 |
+
noise_pred_uncond = self.ip_adapter(
|
| 992 |
+
image_embeds,
|
| 993 |
+
latents.to(device=self.transformer.device),
|
| 994 |
+
layer_scale=neg_layer_scale,
|
| 995 |
+
timestep=div_timestep,
|
| 996 |
+
guidance=guidance,
|
| 997 |
+
pooled_projections=negative_pooled_prompt_embeds.to(device=self.transformer.device),
|
| 998 |
+
encoder_hidden_states=negative_prompt_embeds.to(device=self.transformer.device),
|
| 999 |
+
attention_mask=negative_mask,
|
| 1000 |
+
txt_ids=negative_text_ids.to(device=self.transformer.device) if negative_text_ids is not None else None,
|
| 1001 |
+
img_ids=latent_image_ids[0].to(device=self.transformer.device),
|
| 1002 |
+
return_dict=False,
|
| 1003 |
+
)[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1004 |
|
| 1005 |
# Combine conditional and unconditional predictions
|
| 1006 |
noise_pred = noise_pred_uncond + guidance_scale_real * (noise_pred - noise_pred_uncond)
|