frutiemax
/

rct_model

Model card Files Files and versions

xet

Community

frutiemax committed on Oct 2, 2023

Commit

aa6b13c

1 Parent(s): f53657a

Use VaeImageProcessor

Browse files

Files changed (3) hide show

rct_diffusion_pipeline.py +23 -13
test_pipeline.py +2 -2
train_model.py +7 -14

rct_diffusion_pipeline.py CHANGED Viewed

@@ -10,9 +10,10 @@ from datasets import load_dataset
 import numpy as np
 import pandas as pd
 from tqdm.auto import tqdm
 class RCTDiffusionPipeline(DiffusionPipeline):
-    def __init__(self, unet, scheduler, vae, text_tokenizer, text_encoder, latent_size=32, sample_size=256):
         super().__init__()
         # dictionary that keeps the different classes of object description, color1, color2 and color3
@@ -29,6 +30,9 @@ class RCTDiffusionPipeline(DiffusionPipeline):
         self.text_encoder = text_encoder
         self.text_tokenizer = text_tokenizer
         # channels for 1 image
         self.num_channels = int(self.unet.config.in_channels)
         self.load_dictionaries_from_dataset()
@@ -172,8 +176,7 @@ class RCTDiffusionPipeline(DiffusionPipeline):
     def generate_noise_batches(self, batch_size):
         noise_batches = torch.Tensor(size=(batch_size, self.num_channels, self.latent_size, self.latent_size)).to(dtype=torch.float16, device='cuda')
-        seed = int(0)
-        np.random.seed(seed)
         torch.manual_seed(seed)
         torch.cuda.manual_seed(seed)
         for batch_index in range(batch_size):
@@ -260,6 +263,7 @@ class RCTDiffusionPipeline(DiffusionPipeline):
         # now call the model for the n iterations
         progress_bar = tqdm(total=num_inference_steps)
         epoch = 0
         for t in self.scheduler.timesteps:
             progress_bar.set_description(f'Inference step {epoch}')
@@ -269,8 +273,14 @@ class RCTDiffusionPipeline(DiffusionPipeline):
                     noise_residual = self.unet(noise_batch, t, encoder_hidden_states=embeddings).sample
                 previous_noisy_sample = self.scheduler.step(noise_residual, t, noise_batch).prev_sample
                 noise_batches[batch_index] = previous_noisy_sample
             progress_bar.update(1)
             epoch = epoch + 1
         # reshape the data so we get back 4 RGB images
         noise_batches = torch.reshape(noise_batches, (batch_size, self.num_channels, self.latent_size, self.latent_size))
@@ -280,22 +290,22 @@ class RCTDiffusionPipeline(DiffusionPipeline):
         with torch.no_grad():
             image = noise_batches
-            result = self.vae.decode(image).sample
-            images = result
-            images = images / self.vae.config.scaling_factor
         # convert those tensors to PIL images
         tensor_to_pil = T.ToPILImage()
         output_images = []
         for batch_index in range(batch_size):
             image = images[batch_index]
-            image = (image / 2 + 0.5).clamp(0, 1)
-            #image = (image.permute(1, 2, 0) * 255).to(torch.uint8).cpu().numpy()
-            #image = (image * 255).round().astype("uint8")
-            #image = Image.fromarray(image)
-            image = tensor_to_pil(image)
-            image.save(f'test{batch_index}.png')
             output_images.append(image)
         # for now just return the images
-        return output_images

 import numpy as np
 import pandas as pd
 from tqdm.auto import tqdm
+from diffusers.image_processor import VaeImageProcessor
 class RCTDiffusionPipeline(DiffusionPipeline):
+    def __init__(self, unet, scheduler, vae, text_tokenizer, text_encoder, vae_image_processor : VaeImageProcessor, latent_size=32, sample_size=256):
         super().__init__()
         # dictionary that keeps the different classes of object description, color1, color2 and color3
         self.text_encoder = text_encoder
         self.text_tokenizer = text_tokenizer
+        # use vae image processor
+        self.vae_image_processor = vae_image_processor
         # channels for 1 image
         self.num_channels = int(self.unet.config.in_channels)
         self.load_dictionaries_from_dataset()
     def generate_noise_batches(self, batch_size):
         noise_batches = torch.Tensor(size=(batch_size, self.num_channels, self.latent_size, self.latent_size)).to(dtype=torch.float16, device='cuda')
+        seed = torch.seed()
         torch.manual_seed(seed)
         torch.cuda.manual_seed(seed)
         for batch_index in range(batch_size):
         # now call the model for the n iterations
         progress_bar = tqdm(total=num_inference_steps)
         epoch = 0
+        test_image = None
         for t in self.scheduler.timesteps:
             progress_bar.set_description(f'Inference step {epoch}')
                     noise_residual = self.unet(noise_batch, t, encoder_hidden_states=embeddings).sample
                 previous_noisy_sample = self.scheduler.step(noise_residual, t, noise_batch).prev_sample
                 noise_batches[batch_index] = previous_noisy_sample
+                # test
+                test_image = self.decode_latent(noise_batches[batch_index], self.vae.config.scaling_factor)
             progress_bar.update(1)
             epoch = epoch + 1
+        test_image.show()
         # reshape the data so we get back 4 RGB images
         noise_batches = torch.reshape(noise_batches, (batch_size, self.num_channels, self.latent_size, self.latent_size))
         with torch.no_grad():
             image = noise_batches
+            result = self.vae.decode(image / self.vae.config.scaling_factor).sample
+            image = self.vae_image_processor.denormalize(result)
+            images = image
         # convert those tensors to PIL images
         tensor_to_pil = T.ToPILImage()
         output_images = []
         for batch_index in range(batch_size):
             image = images[batch_index]
             output_images.append(image)
         # for now just return the images
+        return [tensor_to_pil(image) for image in output_images]
+    def decode_latent(self, image, vae_scaling_factor) -> torch.Tensor:
+        tensor_to_pil = T.ToPILImage()
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = tensor_to_pil(image)
+        return image

test_pipeline.py CHANGED Viewed

@@ -39,8 +39,8 @@ vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", use_safetensors
 vae = vae.to('cuda', dtype=torch.float16)
 #pipeline = RCTDiffusionPipeline(unet, scheduler, vae, tokenizer, text_encoder)
-pipeline = RCTDiffusionPipeline.from_pretrained('rct_foliage_999')
-output = pipeline(['pagoda pine tree'], ['green'], ['grey'])
 output[0].save('out.png')
 pipeline.save_pretrained('test')
 print('test')

 vae = vae.to('cuda', dtype=torch.float16)
 #pipeline = RCTDiffusionPipeline(unet, scheduler, vae, tokenizer, text_encoder)
+pipeline = RCTDiffusionPipeline.from_pretrained('rct_foliage_249')
+output = pipeline(['(cabbage) pagoda tree'], ['(dark) green'], ['brown'])
 output[0].save('out.png')
 pipeline.save_pretrained('test')
 print('test')

train_model.py CHANGED Viewed

@@ -14,6 +14,7 @@ from accelerate import Accelerator
 from diffusers import DDPMScheduler, UNet2DConditionModel, AutoencoderKL
 from transformers import CLIPTextModel, CLIPTokenizer
 import torch.nn as nn
 SAMPLE_SIZE = 256
 LATENT_SIZE = 32
@@ -31,24 +32,13 @@ def save_and_test(pipeline, epoch):
     pipeline.save_pretrained(model_file)
 def transform_images(image):
-    res = torch.Tensor((SAMPLE_NUM_CHANNELS, SAMPLE_SIZE, SAMPLE_SIZE))
     pil_to_tensor = T.PILToTensor()
-    tensor_to_pil = T.ToPILImage()
-    res_index = 0
     scale_factor = np.minimum(SAMPLE_SIZE / image.width, SAMPLE_SIZE / image.height)
     image = Image.resize(image, size=(int(scale_factor * image.width), int(scale_factor * image.height)), resample=Resampling.NEAREST)
     new_image = PIL.Image.new('RGB', (SAMPLE_SIZE, SAMPLE_SIZE))
     new_image.paste(image, box=(int((SAMPLE_SIZE - image.width)/2), int((SAMPLE_SIZE - image.height)/2)))
-    #data = np.array(new_image, dtype=np.float32)
-    #data = (data / 128.0 - 1.0)
-    #res = torch.from_numpy(data)
-    res = pil_to_tensor(new_image)
-    res.to(dtype=torch.float32)
-    res = res / torch.Tensor([128.0]) - torch.Tensor([1.0])
-    return res
 def convert_images(dataset):
     images = [transform_images(image) for image in dataset["image"]]
@@ -101,6 +91,8 @@ def create_embeddings(dataset, model):
 def train_model(batch_size=4, total_images=-1, epochs=100, scheduler_num_timesteps=100, save_model_interval=10, start_learning_rate=1e-4, lr_warmup_steps=500):
     dataset = load_dataset('frutiemax/rct_dataset', split=f'train[0:{total_images}]')
     dataset.set_transform(convert_images)
     num_images = dataset.num_rows
@@ -133,7 +125,7 @@ def train_model(batch_size=4, total_images=-1, epochs=100, scheduler_num_timeste
         num_warmup_steps=lr_warmup_steps,
         num_training_steps=num_images * epochs
     )
-    model = RCTDiffusionPipeline(unet, scheduler, vae, tokenizer, text_encoder)
     unet = unet.to('cuda')
     train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
@@ -154,6 +146,7 @@ def train_model(batch_size=4, total_images=-1, epochs=100, scheduler_num_timeste
                 to(device='cuda')
             # use the vae to get the latent images
             latent_images = vae.encode(clean_images).latent_dist.sample()
             latent_images = latent_images * vae.config.scaling_factor
@@ -192,4 +185,4 @@ def train_model(batch_size=4, total_images=-1, epochs=100, scheduler_num_timeste
 if __name__ == '__main__':
-    train_model(batch_size=16, save_model_interval=25, epochs=500, start_learning_rate=1e-5)

 from diffusers import DDPMScheduler, UNet2DConditionModel, AutoencoderKL
 from transformers import CLIPTextModel, CLIPTokenizer
 import torch.nn as nn
+from diffusers.image_processor import VaeImageProcessor
 SAMPLE_SIZE = 256
 LATENT_SIZE = 32
     pipeline.save_pretrained(model_file)
 def transform_images(image):
     pil_to_tensor = T.PILToTensor()
     scale_factor = np.minimum(SAMPLE_SIZE / image.width, SAMPLE_SIZE / image.height)
     image = Image.resize(image, size=(int(scale_factor * image.width), int(scale_factor * image.height)), resample=Resampling.NEAREST)
     new_image = PIL.Image.new('RGB', (SAMPLE_SIZE, SAMPLE_SIZE))
     new_image.paste(image, box=(int((SAMPLE_SIZE - image.width)/2), int((SAMPLE_SIZE - image.height)/2)))
+    return pil_to_tensor(new_image)
 def convert_images(dataset):
     images = [transform_images(image) for image in dataset["image"]]
 def train_model(batch_size=4, total_images=-1, epochs=100, scheduler_num_timesteps=100, save_model_interval=10, start_learning_rate=1e-4, lr_warmup_steps=500):
+    vae_image_processor = VaeImageProcessor()
     dataset = load_dataset('frutiemax/rct_dataset', split=f'train[0:{total_images}]')
     dataset.set_transform(convert_images)
     num_images = dataset.num_rows
         num_warmup_steps=lr_warmup_steps,
         num_training_steps=num_images * epochs
     )
+    model = RCTDiffusionPipeline(unet, scheduler, vae, tokenizer, text_encoder, vae_image_processor)
     unet = unet.to('cuda')
     train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
                 to(device='cuda')
             # use the vae to get the latent images
+            clean_images = vae_image_processor.preprocess(clean_images)
             latent_images = vae.encode(clean_images).latent_dist.sample()
             latent_images = latent_images * vae.config.scaling_factor
 if __name__ == '__main__':
+    train_model(batch_size=1, total_images=4, save_model_interval=25, epochs=500, start_learning_rate=1e-5)