Updated prompt tweening, trying to fix error "Input type (float) and bias type (c10::Half) should be the same"
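Note on the error in the title: "Input type (float) and bias type (c10::Half) should be the same" is what PyTorch raises when a float32 tensor reaches a layer whose weights are float16, as happens when latents or embeddings are created in full precision but the pipeline was loaded with half-precision weights. A minimal sketch of the usual remedy follows; match_model_dtype is a hypothetical helper for illustration, not part of this commit:

import torch

def match_model_dtype(tensor: torch.Tensor, module: torch.nn.Module) -> torch.Tensor:
    # Cast `tensor` to the precision of `module`'s weights (float16 when the
    # pipeline was loaded with torch_dtype=torch.float16).
    return tensor.to(dtype=next(module.parameters()).dtype)

# For example, before the self.unet(...) call in the denoising loop below:
#   latents = match_model_dtype(latents, self.unet)
#   text_embeddings = match_model_dtype(text_embeddings, self.unet)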
pipeline.py (+32 -12)
@@ -1167,6 +1167,7 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
 
     # Borrowed from https://github.com/csaluski/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
     def get_text_latent_space(self, prompt, guidance_scale = 7.5):
+
         # get prompt text embeddings
         text_input = self.tokenizer(
             prompt,
@@ -1176,7 +1177,7 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
             return_tensors="pt",
         )
         text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]
-
+
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
@@ -1195,7 +1196,7 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
             text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
 
         return text_embeddings
-
+
     def slerp(self, t, v0, v1, DOT_THRESHOLD=0.9995):
         """ helper function to spherically interpolate two arrays v1 v2
         from https://gist.github.com/karpathy/00103b0037c5aaea32fe1da1af553355
@@ -1292,11 +1293,11 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
         eta: Optional[float] = 0.0,
         generator: Optional[torch.Generator] = None,
         output_type: Optional[str] = "pil",
+        save_n_steps: Optional[int] = None,
         **kwargs,):
 
-        from diffusers.schedulers import LMSDiscreteScheduler
         batch_size = 1
-
+
         if generator == None:
             generator = torch.Generator("cuda")
         generator_state = generator.get_state()
@@ -1330,8 +1331,27 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
         extra_step_kwargs = {}
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
-
+        if save_n_steps:
+            mid_latents = []
+            mid_images = []
+        else:
+            mid_latents = None
+            mid_images = None
         for i, t in tqdm(enumerate(self.scheduler.timesteps)):
+            if save_n_steps:
+                if i % save_n_steps == 0:
+                    # scale and decode the image latents with vae
+                    dec_mid_latents = 1 / 0.18215 * latents
+                    mid_latents.append(dec_mid_latents)
+                    image = self.vae.decode(dec_mid_latents).sample
+
+                    image = (image / 2 + 0.5).clamp(0, 1)
+                    image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+                    if output_type == "pil":
+                        image = self.numpy_to_pil(image)
+                    mid_latents.append(image)
+                    mid_images.append(image)
             # expand the latents if we are doing classifier free guidance
             latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
             if isinstance(self.scheduler, LMSDiscreteScheduler):
@@ -1339,7 +1359,7 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
                 latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
 
             # predict the noise residual
-            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings)
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings)["sample"]
 
             # perform guidance
             if do_classifier_free_guidance:
@@ -1348,21 +1368,21 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
 
             # compute the previous noisy sample x_t -> x_t-1
             if isinstance(self.scheduler, LMSDiscreteScheduler):
-                latents = self.scheduler.step(noise_pred, i, latents, **extra_step_kwargs)
+                latents = self.scheduler.step(noise_pred, i, latents, **extra_step_kwargs)["prev_sample"]
             else:
-                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs)
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs)["prev_sample"]
 
         # scale and decode the image latents with vae
         latents = 1 / 0.18215 * latents
-        image = self.vae.decode(latents)
+        image = self.vae.decode(latents).sample
 
         image = (image / 2 + 0.5).clamp(0, 1)
         image = image.cpu().permute(0, 2, 3, 1).numpy()
-
+
         if output_type == "pil":
             image = self.numpy_to_pil(image)
 
-        return {"image": image, "generator_state": generator_state}
+        return {"image": image, "generator_state": generator_state, "mid_latents": mid_latents, "mid_images": mid_images}
 
     def variation(self, text_embeddings, generator_state, variation_magnitude = 100, **kwargs):
         # random vector to move in latent space
@@ -1370,7 +1390,7 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
         rand_mag = torch.sum(torch.abs(rand_t)) / variation_magnitude
         scaled_rand_t = rand_t / rand_mag
         variation_embedding = text_embeddings + scaled_rand_t
-
+
         generator = torch.Generator("cuda")
         generator.set_state(generator_state)
         result = self.diffuse_from_inits(variation_embedding, generator=generator, **kwargs)
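Usage note for the new save_n_steps parameter: a hypothetical sketch, not code from this commit. It assumes the edited signature belongs to diffuse_from_inits (the method variation calls) and that pipe is a StableDiffusionLongPromptWeightingPipeline already loaded and moved to CUDA:

import torch

text_embeddings = pipe.get_text_latent_space("a watercolor fox, highly detailed")

result = pipe.diffuse_from_inits(
    text_embeddings,
    generator=torch.Generator("cuda").manual_seed(0),
    save_n_steps=10,  # decode and keep a snapshot every 10th denoising step
    output_type="pil",
)

result["image"][0].save("final.png")

# mid_images is None when save_n_steps is unset; otherwise it holds one
# numpy_to_pil batch (a list of PIL images) per saved step.
for step_idx, frame_batch in enumerate(result["mid_images"] or []):
    frame_batch[0].save(f"step_{step_idx}.png")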
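For the prompt tweening the commit title refers to, the helpers above compose directly: embed two prompts with get_text_latent_space, interpolate with slerp, and render each intermediate embedding from identical starting noise. A hedged sketch, assuming the gist-derived slerp accepts these embedding tensors and pipe as in the previous example:

import torch

start = pipe.get_text_latent_space("a photo of a cat")
end = pipe.get_text_latent_space("a photo of a tiger")

frames = []
for t in torch.linspace(0.0, 1.0, steps=8):
    # spherical interpolation keeps intermediate embeddings at a sensible norm
    tween = pipe.slerp(float(t), start, end)
    # reseeding each frame keeps the initial noise identical across the tween
    out = pipe.diffuse_from_inits(tween, generator=torch.Generator("cuda").manual_seed(0))
    frames.append(out["image"][0])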