Spaces:

mattb512
/

fastai-lesson-10-diffusers

Runtime error

App Files Files Community

mattb512 committed on Jan 26, 2024

Commit

e57df08

1 Parent(s): 29f02ac

fix image type and float size

Browse files

Files changed (2) hide show

app.py +2 -2
image_generator.py +18 -13

app.py CHANGED Viewed

@@ -6,7 +6,7 @@ print(ig)
 ig.load_models()
 ig.load_scheduler()
-def greet(prompt, mix_prompt, mix_ratio, negative_prompt, steps, init_image ):
     print(f"{prompt=} {mix_prompt=} {mix_ratio=} {negative_prompt=} {steps=} {init_image=} ")
     generated_image, latents = ig.generate(
@@ -26,7 +26,7 @@ def greet(prompt, mix_prompt, mix_ratio, negative_prompt, steps, init_image ):
     return generated_image, noisy_latent
 iface = gr.Interface(
-    fn=greet,
     inputs=[
         gr.Textbox(value="a cute dog", label="Prompt", info="primary prompt used to generate an image"),
         gr.Textbox(value=None, label="Secondary Prompt",  info="secondary prompt to mix with the primary embeddings"),

 ig.load_models()
 ig.load_scheduler()
+def call(prompt, mix_prompt, mix_ratio, negative_prompt, steps, init_image ):
     print(f"{prompt=} {mix_prompt=} {mix_ratio=} {negative_prompt=} {steps=} {init_image=} ")
     generated_image, latents = ig.generate(
     return generated_image, noisy_latent
 iface = gr.Interface(
+    fn=call,
     inputs=[
         gr.Textbox(value="a cute dog", label="Prompt", info="primary prompt used to generate an image"),
         gr.Textbox(value=None, label="Secondary Prompt",  info="secondary prompt to mix with the primary embeddings"),

image_generator.py CHANGED Viewed

@@ -28,16 +28,22 @@ class ImageGenerator():
         self.height = 512
         self.generator = torch.manual_seed(32)
         self.bs = 1
     def __repr__(self):
         return f"Image Generator with {self.g=}"
     def load_models(self):
-        self.tokenizer    = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14",        torch_dtype=torch.float16)
-        self.text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14",        torch_dtype=torch.float16                          ).to("cuda")
-        # vae             = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-ema",             torch_dtype=torch.float16                          ).to("cuda")
-        self.vae          = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4",                                    subfolder="vae"         ).to("cuda")
-        self.unet         = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4",                             subfolder="unet"        ).to("cuda") #torch_dtype=torch.float16,
     def load_scheduler( self,
                         beta_start : float=0.00085,
@@ -63,7 +69,7 @@ class ImageGenerator():
             np_images = np.repeat(np_img[np.newaxis, :, :], self.bs, axis=0) # adding a new dimension and repeating the image for each prompt
             # print(f"{np_images.shape=}")
-            decoded_latent = torch.from_numpy(np_images).to("cuda").float() #<-- stability-ai vae uses half(), compvis vae uses float?
             # print(f"{decoded_latent.shape=}")
             encoded_latent = 0.18215 * self.vae.encode(decoded_latent).latent_dist.sample()
@@ -75,7 +81,7 @@ class ImageGenerator():
         # noise = torch.randn_like(latent) # missing generator parameter
         noise = torch.randn(
                 size = (self.bs, self.unet.config.in_channels, self.height//8, self.width//8),
-                generator = self.generator).to("cuda")
         timesteps = torch.tensor([self.scheduler.timesteps[scheduler_steps]])
         noisy_latent = self.scheduler.add_noise(latent, noise, timesteps)
         # print(f"add_noise: {timesteps.shape=} {timesteps=} {noisy_latent.shape=}")
@@ -103,7 +109,7 @@ class ImageGenerator():
         if maxlen is None: maxlen = self.tokenizer.model_max_length
         inp = self.tokenizer([prompt], padding="max_length", max_length=maxlen, truncation=True, return_tensors="pt")
-        return self.text_encoder(inp.input_ids.to("cuda"))[0].float()
     def tensor_to_pil(self, t:torch.Tensor) -> Image:
         '''transforms a tensor decoded by the vae to a pil image'''
@@ -126,7 +132,7 @@ class ImageGenerator():
                  seed : int=32,
                  steps : int=30,
                  start_step_ratio : float=1/5,
-                 init_image : str=None,
                  latent_callback_mod : int=10):
         self.latent_images = []
         if not negative_prompt: negative_prompt = ""
@@ -153,13 +159,12 @@ class ImageGenerator():
         else:
             start_steps = int(steps * start_step_ratio) # 0%: too much noise, 100% no noise
             # print(f"{start_steps=}")
-            img = self.load_image(init_image)
-            latents =self. pil_to_latent(img)
             self.latent_callback(latents)
-            latents = self.add_noise(latents, start_steps).to("cuda").float()
             self.latent_callback(latents)
-        latents = latents.to("cuda").float()
         for i,ts in enumerate(tqdm(self.scheduler.timesteps, leave=False)):
             if i >= start_steps:

         self.height = 512
         self.generator = torch.manual_seed(32)
         self.bs = 1
+        if torch.cuda.is_available():
+            self.device = torch.device("cuda")
+            self.float_size = torch.float16
+        else:
+            self.device = torch.device("cpu")
+            self.float_size = torch.float32
     def __repr__(self):
         return f"Image Generator with {self.g=}"
     def load_models(self):
+        self.tokenizer    = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=self.float_size)
+        self.text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=self.float_size).to( self.device)
+        # vae             = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-ema",             torch_dtype=torch.float16                          ).to(self.device)
+        self.vae          = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4",   subfolder="vae").to( self.device)
+        self.unet         = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet" ).to( self.device) #torch_dtype=torch.float16,
     def load_scheduler( self,
                         beta_start : float=0.00085,
             np_images = np.repeat(np_img[np.newaxis, :, :], self.bs, axis=0) # adding a new dimension and repeating the image for each prompt
             # print(f"{np_images.shape=}")
+            decoded_latent = torch.from_numpy(np_images).to(self.device).float() #<-- stability-ai vae uses half(), compvis vae uses float?
             # print(f"{decoded_latent.shape=}")
             encoded_latent = 0.18215 * self.vae.encode(decoded_latent).latent_dist.sample()
         # noise = torch.randn_like(latent) # missing generator parameter
         noise = torch.randn(
                 size = (self.bs, self.unet.config.in_channels, self.height//8, self.width//8),
+                generator = self.generator).to(self.device)
         timesteps = torch.tensor([self.scheduler.timesteps[scheduler_steps]])
         noisy_latent = self.scheduler.add_noise(latent, noise, timesteps)
         # print(f"add_noise: {timesteps.shape=} {timesteps=} {noisy_latent.shape=}")
         if maxlen is None: maxlen = self.tokenizer.model_max_length
         inp = self.tokenizer([prompt], padding="max_length", max_length=maxlen, truncation=True, return_tensors="pt")
+        return self.text_encoder(inp.input_ids.to(self.device))[0].float()
     def tensor_to_pil(self, t:torch.Tensor) -> Image:
         '''transforms a tensor decoded by the vae to a pil image'''
                  seed : int=32,
                  steps : int=30,
                  start_step_ratio : float=1/5,
+                 init_image : Image=None,
                  latent_callback_mod : int=10):
         self.latent_images = []
         if not negative_prompt: negative_prompt = ""
         else:
             start_steps = int(steps * start_step_ratio) # 0%: too much noise, 100% no noise
             # print(f"{start_steps=}")
+            latents =self. pil_to_latent(init_image)
             self.latent_callback(latents)
+            latents = self.add_noise(latents, start_steps).to(self.device).float()
             self.latent_callback(latents)
+        latents = latents.to(self.device).float()
         for i,ts in enumerate(tqdm(self.scheduler.timesteps, leave=False)):
             if i >= start_steps: