Use SGD and text encoder/tokenizer
- rct_diffusion_pipeline.py +70 -7
- test_pipeline.py +21 -3
- train_model.py +82 -46
rct_diffusion_pipeline.py
CHANGED
@@ -12,7 +12,7 @@ import pandas as pd
 from tqdm.auto import tqdm
 
 class RCTDiffusionPipeline(DiffusionPipeline):
-    def __init__(self, unet, scheduler, vae, latent_size=32, sample_size=256):
+    def __init__(self, unet, scheduler, vae, text_tokenizer, text_encoder, latent_size=32, sample_size=256):
         super().__init__()
 
         # dictionnary that keeps the different classes of object description, color1, color2 and color3
@@ -26,11 +26,13 @@ class RCTDiffusionPipeline(DiffusionPipeline):
         self.vae = vae
         self.latent_size = latent_size
         self.sample_size = sample_size
+        self.text_encoder = text_encoder
+        self.text_tokenizer = text_tokenizer
 
         # channels for 1 image
         self.num_channels = int(self.unet.config.in_channels / 4)
         self.load_dictionaries_from_dataset()
-        self.register_modules(unet=unet, scheduler=scheduler, vae=vae)
+        self.register_modules(unet=unet, scheduler=scheduler, vae=vae, text_tokenizer=text_tokenizer, text_encoder=text_encoder)
 
     def load_dictionaries_from_dataset(self):
         dataset = load_dataset('frutiemax/rct_dataset')
@@ -177,13 +179,72 @@ class RCTDiffusionPipeline(DiffusionPipeline):
 
         return torch.reshape(noise_batches, (batch_size, 1, self.num_channels*4, self.latent_size, self.latent_size)).to(dtype=torch.float16, device='cuda')
 
-    def …
-        …
+    def test_generate_embeddings(self, object_description, color1, color2, color3) -> torch.Tensor:
+        batch_size = len(object_description)
+
+        embeddings = torch.Tensor(size=(batch_size, 77, 768))
+        for batch_index in range(batch_size):
+            prompt = f'{object_description[batch_index]},{color1[batch_index]},{color2[batch_index]}, {color3[batch_index]}'
+            tokens = self.text_tokenizer(prompt, \
+                padding="max_length", max_length=self.text_tokenizer.model_max_length, truncation=True, return_tensors="pt")
+            with torch.no_grad():
+                embeddings[batch_index] = self.text_encoder(tokens.input_ids.to('cuda'))[0]
+
+        return embeddings.to(dtype=torch.float16)
+
+    def generate_embeddings(self, object_description, color1, color2, color3) -> torch.Tensor:
+        batch_size = len(object_description)
+
+        embeddings = torch.Tensor(size=(batch_size, 77, 768 * 4))
+        for batch_index in range(batch_size):
+            object_description_tokens = self.text_tokenizer(object_description[batch_index], \
+                padding="max_length", max_length=self.text_tokenizer.model_max_length, truncation=True, return_tensors="pt")
+            color1_tokens = self.text_tokenizer(color1[batch_index], \
+                padding="max_length", max_length=self.text_tokenizer.model_max_length, truncation=True, return_tensors="pt")
+            color2_tokens = self.text_tokenizer(color2[batch_index], \
+                padding="max_length", max_length=self.text_tokenizer.model_max_length, truncation=True, return_tensors="pt")
+            color3_tokens = self.text_tokenizer(color3[batch_index], \
+                padding="max_length", max_length=self.text_tokenizer.model_max_length, truncation=True, return_tensors="pt")
+            with torch.no_grad():
+                object_description_embeddings = self.text_encoder(object_description_tokens.input_ids.to('cuda'))[0]
+                color1_embeddings = self.text_encoder(color1_tokens.input_ids.to('cuda'))[0]
+                color2_embeddings = self.text_encoder(color2_tokens.input_ids.to('cuda'))[0]
+                color3_embeddings = self.text_encoder(color3_tokens.input_ids.to('cuda'))[0]
+
+            emb = torch.cat([object_description_embeddings, color1_embeddings, color2_embeddings, color3_embeddings], dim=2)
+            embeddings[batch_index] = emb
+
+        return embeddings.to(dtype=torch.float16)
+
+    def validate_inputs(self, object_description : list[str], color1 : list[str], \
+        color2 : list[str], color3 : list[str], batch_size) -> tuple[bool, list[str], list[str], list[str], list[str]]:
+        # check if the labels sizes are correct
+        if len(object_description) != batch_size:
+            return False
+
+        if len(color1) != batch_size:
+            return False
+
+        if color2 == None:
+            color2 = ['none'] * batch_size
+        elif len(color2) != batch_size:
+            return False
+
+        if color3 == None:
+            color3 = ['none'] * batch_size
+        elif len(color3) != batch_size:
+            return False
+        return True, object_description, color1, color2, color3
+
+    def __call__(self, object_description : list[str], color1 : list[str], \
+        color2 : list[str] = None, color3 : list[str] = None, \
         batch_size=1, num_inference_steps=20, generator=torch.manual_seed(torch.random.seed())):
 
-        …
-        if …
+        res, object_description, color1, color2, color3 = self.validate_inputs(object_description, color1, color2, color3, batch_size)
+        if res == False:
             return None
+        embeddings = self.test_generate_embeddings(object_description, color1, color2, color3)
+        embeddings = embeddings.to('cuda')
 
         # set the inference steps
         self.scheduler.set_timesteps(num_inference_steps)
@@ -196,8 +257,9 @@ class RCTDiffusionPipeline(DiffusionPipeline):
             progress_bar.set_description(f'Inference step {epoch}')
 
             for batch_index in range(batch_size):
+                noise_batches[batch_index] = self.scheduler.scale_model_input(noise_batches[batch_index], timestep=t)
                 with torch.no_grad():
-                    noise_residual = self.unet(noise_batches[batch_index], t, encoder_hidden_states=…
+                    noise_residual = self.unet(noise_batches[batch_index], t, encoder_hidden_states=embeddings).sample
                     previous_noisy_sample = self.scheduler.step(noise_residual, t, noise_batches[batch_index]).prev_sample
                     noise_batches[batch_index] = previous_noisy_sample
             progress_bar.update(1)
@@ -223,6 +285,7 @@ class RCTDiffusionPipeline(DiffusionPipeline):
             image = (image.permute(1, 2, 0) * 255).to(torch.uint8).cpu().numpy()
             image = (image * 255).round().astype("uint8")
             image = Image.fromarray(image)
+            image.save(f'test{image_index}.png')
            output_images.append(image)
 
        # for now just return the images
test_pipeline.py
CHANGED
@@ -1,20 +1,38 @@
 from rct_diffusion_pipeline import RCTDiffusionPipeline
 from diffusers import UNet2DConditionModel, DDPMScheduler, AutoencoderKL
 import torch
+from transformers import CLIPTextModel, CLIPTokenizer
 
 torch_device = "cuda"
 
+# test of text tokenizers
+tokenizer = CLIPTokenizer.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="tokenizer")
+text_encoder = CLIPTextModel.from_pretrained(
+    "CompVis/stable-diffusion-v1-4", subfolder="text_encoder", use_safetensors=True
+).to('cuda')
+
+test1 = tokenizer(['aleppo pine tree, common oak tree'], padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
+#test3 = tokenizer([1.0, 0.0, .05], is_split_into_words=True, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
+
+with torch.no_grad():
+    test1 = text_encoder(test1.input_ids.to('cuda'))[0]
+
+test2 = tokenizer('dark green', padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
+
+with torch.no_grad():
+    test2 = text_encoder(test2.input_ids.to('cuda'))[0]
+
 unet = UNet2DConditionModel(sample_size=32, in_channels=16, out_channels=16, \
     down_block_types=('CrossAttnDownBlock2D', 'CrossAttnDownBlock2D', 'DownBlock2D'),\
-    up_block_types=('UpBlock2D', 'CrossAttnUpBlock2D', 'CrossAttnUpBlock2D'), cross_attention_dim=…
+    up_block_types=('UpBlock2D', 'CrossAttnUpBlock2D', 'CrossAttnUpBlock2D'), cross_attention_dim=768*4,
     block_out_channels=(64, 128, 256), norm_num_groups=32)
 unet = unet.to('cuda', dtype=torch.float16)
 scheduler = DDPMScheduler(num_train_timesteps=20)
 vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", use_safetensors=True)
 vae = vae.to('cuda', dtype=torch.float16)
 
-pipeline = RCTDiffusionPipeline(unet, scheduler, vae)
-output = pipeline([…
+pipeline = RCTDiffusionPipeline(unet, scheduler, vae, tokenizer, text_encoder)
+output = pipeline(['aleppo pine tree'], ['dark green'])
 pipeline.save_pretrained('test')
 
 # from PIL import Image
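
Note that this test builds the UNet with cross_attention_dim=768*4, the width of the concatenated generate_embeddings layout, while __call__ currently conditions on the 768-wide test_generate_embeddings output; the two values must agree for the cross-attention projections to accept the tensor. A quick shape check under that assumption, using only diffusers calls already shown in the diff (toy tensors, CPU, float32):

import torch
from diffusers import UNet2DConditionModel

# same architecture as the test above, kept on CPU/float32 for a fast check
unet = UNet2DConditionModel(sample_size=32, in_channels=16, out_channels=16,
    down_block_types=('CrossAttnDownBlock2D', 'CrossAttnDownBlock2D', 'DownBlock2D'),
    up_block_types=('UpBlock2D', 'CrossAttnUpBlock2D', 'CrossAttnUpBlock2D'),
    cross_attention_dim=768 * 4, block_out_channels=(64, 128, 256), norm_num_groups=32)

sample = torch.randn(1, 16, 32, 32)    # (batch, in_channels, sample_size, sample_size)
cond = torch.randn(1, 77, 768 * 4)     # last dim must equal cross_attention_dim
out = unet(sample, timestep=0, encoder_hidden_states=cond).sample
assert out.shape == sample.shape       # the UNet preserves the latent shape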
train_model.py
CHANGED
@@ -11,6 +11,7 @@ from diffusers.optimization import get_cosine_schedule_with_warmup
 from tqdm.auto import tqdm
 from accelerate import Accelerator
 from diffusers import DDPMScheduler, UNet2DConditionModel, AutoencoderKL
+from transformers import CLIPTextModel, CLIPTokenizer
 
 SAMPLE_SIZE = 256
 LATENT_SIZE = 32
@@ -18,12 +19,12 @@ SAMPLE_NUM_CHANNELS = 3
 LATENT_NUM_CHANNELS = 4
 
 def save_and_test(pipeline, epoch):
-    outputs = pipeline([…
+    outputs = pipeline(['aleppo pine tree'], ['dark green'])
     for image_index in range(len(outputs)):
         file_name = f'out{image_index}_{epoch}.png'
         outputs[image_index].save(file_name)
 
-    model_file = f'rct_foliage_{epoch}…
+    model_file = f'rct_foliage_{epoch}'
     pipeline.save_pretrained(model_file)
 
 def convert_images(dataset):
@@ -42,18 +43,18 @@ def convert_images(dataset):
         for entry in views[view_index]:
             image = entry['image']
 
-            scale_factor = …
-            image = Image.resize(image, size=(scale_factor * image.width, scale_factor * image.height), resample=Resampling.NEAREST)
+            scale_factor = np.minimum(LATENT_SIZE / image.width, LATENT_SIZE / image.height)
+            image = Image.resize(image, size=(int(scale_factor * image.width), int(scale_factor * image.height)), resample=Resampling.NEAREST)
 
-            new_image = PIL.Image.new('…
-            new_image.paste(image, box=(int((…
+            new_image = PIL.Image.new('RGBA', (LATENT_SIZE, LATENT_SIZE))
+            new_image.paste(image, box=(int((LATENT_SIZE - image.width)/2), int((LATENT_SIZE - image.height)/2)))
             images.append(new_image)
         image_views.append(images)
 
     del views
 
     # convert those views in tensors
-    targets = torch.Tensor(size=(num_images, 4, …
+    targets = torch.Tensor(size=(num_images, 4, LATENT_NUM_CHANNELS, LATENT_SIZE, LATENT_SIZE)).to(dtype=torch.float16)
     pillow_to_tensor = T.ToTensor()
 
     for image_index in range(num_images):
@@ -62,7 +63,7 @@ def convert_images(dataset):
     del image_views
     del entries
 
-    return torch.reshape(targets, (num_images, 4 * …
+    return torch.reshape(targets, (num_images, 4 * LATENT_NUM_CHANNELS, LATENT_SIZE, LATENT_SIZE))
 
 def convert_labels(dataset, model, num_images):
     # get the labels
@@ -96,80 +97,115 @@ def convert_labels(dataset, model, num_images):
     del dataset
     return class_labels.to(dtype=torch.float16, device='cuda')
 
-def train_model(batch_size=4, epochs=100, scheduler_num_timesteps=20, save_model_interval=10, start_learning_rate=1e-3, lr_warmup_steps=…
+def train_model(batch_size=4, total_images=None, epochs=100, scheduler_num_timesteps=20, save_model_interval=10, start_learning_rate=1e-3, lr_warmup_steps=1):
     dataset = load_dataset('frutiemax/rct_dataset')
     dataset = dataset['train']
 
     targets = convert_images(dataset)
-    num_images = int(dataset.num_rows / 4)
+    num_images = int(dataset.num_rows / 4) if total_images == None else int(total_images / 4)
 
-    unet = UNet2DConditionModel(sample_size=LATENT_SIZE, in_channels=LATENT_NUM_CHANNELS…
-        down_block_types=(…
-        up_block_types=(…
-        block_out_channels=(…
+    unet = UNet2DConditionModel(sample_size=LATENT_SIZE, in_channels=LATENT_NUM_CHANNELS*4, out_channels=LATENT_NUM_CHANNELS*4, \
+        down_block_types=("CrossAttnDownBlock2D","CrossAttnDownBlock2D","CrossAttnDownBlock2D", "DownBlock2D"),\
+        up_block_types=("UpBlock2D","CrossAttnUpBlock2D","CrossAttnUpBlock2D", "CrossAttnUpBlock2D"), cross_attention_dim=768,
+        block_out_channels=(320, 640, 1280, 1280), norm_num_groups=32)
     unet = unet.to(dtype=torch.float16)
-    scheduler = DDPMScheduler(num_train_timesteps=…
+    scheduler = DDPMScheduler(num_train_timesteps=scheduler_num_timesteps)
+    tokenizer = CLIPTokenizer.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="tokenizer")
+    text_encoder = CLIPTextModel.from_pretrained(
+        "CompVis/stable-diffusion-v1-4", subfolder="text_encoder", use_safetensors=True
+    ).to('cuda')
     vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", use_safetensors=True)
     vae = vae.to(dtype=torch.float16)
 
-    optimizer = torch.optim.…
+    optimizer = torch.optim.SGD(unet.parameters(), lr=start_learning_rate)
     lr_scheduler = get_cosine_schedule_with_warmup(
         optimizer=optimizer,
         num_warmup_steps=lr_warmup_steps,
         num_training_steps=num_images * epochs
     )
-    model = RCTDiffusionPipeline(unet, scheduler, vae)
+    model = RCTDiffusionPipeline(unet, scheduler, vae, tokenizer, text_encoder)
+
+    # get all the object descriptions, color1, color2, color3
+    object_descriptions = dataset['object_description']
+    colors1 = dataset['color1']
+    colors2 = dataset['color2']
+    colors3 = dataset['color3']
+
+    # we only need 1 of the 4 views
+    object_descriptions = [object_descriptions[desc_index] for desc_index in range(0, len(object_descriptions), 4)]
+    colors1 = [colors1[desc_index] for desc_index in range(0, len(colors1), 4)]
+    colors2 = [colors2[desc_index] for desc_index in range(0, len(colors2), 4)]
+    colors3 = [colors3[desc_index] for desc_index in range(0, len(colors3), 4)]
+    #embeddings = model.generate_embeddings(object_descriptions, colors1, colors2, colors3)
+    embeddings = model.test_generate_embeddings(object_descriptions, colors1, colors2, colors3)
+
     labels = convert_labels(dataset, model, num_images)
     del model
 
+    if total_images != None:
+        targets = targets[:int(total_images/4)]
+        label_indices = [index for index in range(0, total_images, 4)]
+        labels = labels[label_indices]
+
     # lets train for 100 epoch for each sprite in the dataset with a random noise level
     progress_bar = tqdm(total=epochs)
     accelerator = Accelerator(mixed_precision='fp16')
+    accelerator.clip_grad_norm_(unet.parameters(), 1.0)
     unet, scheduler, lr_scheduler, vae = accelerator.prepare(unet, scheduler, lr_scheduler, vae)
 
+    loss_fn = torch.nn.MSELoss()
+
+    tensor_to_pillow = T.ToPILImage()
     for epoch in range(epochs):
         # create a noisy version of each sprite
         for batch_index in range(0, num_images, batch_size):
-            progress_bar.set_description(f'epoch={epoch}, batch_index={batch_index}')
             batch_end = np.minimum(num_images, batch_index + batch_size)
             clean_images = targets[batch_index:batch_end]
-            clean_images = torch.reshape(clean_images, ((batch_end - batch_index), …
+            clean_images = torch.reshape(clean_images, ((batch_end - batch_index), LATENT_NUM_CHANNELS * 4, LATENT_SIZE, LATENT_SIZE)).\
+                to(device='cuda', dtype=torch.float16)
 
             noise = torch.randn(clean_images.shape, dtype=torch.float16, device='cuda')
             timesteps = torch.randint(0, scheduler.config.num_train_timesteps, (batch_end - batch_index, )).to(device='cuda')
+
             #timesteps = timesteps.to(dtype=torch.int, device='cuda')
             noisy_images = scheduler.add_noise(clean_images, noise, timesteps)
-            …
+
+            # with accelerator.accumulate(unet):
+            #     assert not torch.any(torch.isnan(timesteps))
+
+            #     batch_embeddings = embeddings[batch_index:batch_end]
+            #     batch_embeddings = batch_embeddings.to('cuda')
+
+            #     optimizer.zero_grad()
+            #     unet_results = unet(noisy_images, timesteps, batch_embeddings).sample
+            #     unet_results = unet_results.to(dtype=torch.float16)
+
+            #     loss = loss_fn(unet_results, noise)
+            #     accelerator.backward(loss)
+
+            #     optimizer.step()
+            #     lr_scheduler.step()
+            #     optimizer.zero_grad()
+
+            batch_embeddings = embeddings[batch_index:batch_end]
+            batch_embeddings = batch_embeddings.to('cuda')
+
+            optimizer.zero_grad()
+            unet_results = unet(noisy_images, timesteps, batch_embeddings).sample
+            unet_results = unet_results.to(dtype=torch.float16)
+            loss = loss_fn(unet_results, noise)
+            loss.backward()
+            optimizer.step()
+            lr_scheduler.step()
+            optimizer.zero_grad()
+
+            progress_bar.set_description(f'epoch={epoch}, batch_index={batch_index}, last_loss={loss.item()}')
 
         if (epoch + 1) % save_model_interval == 0:
-            model = RCTDiffusionPipeline(accelerator.unwrap_model(unet), scheduler, vae)
+            model = RCTDiffusionPipeline(accelerator.unwrap_model(unet), scheduler, vae, tokenizer, text_encoder)
             save_and_test(model, epoch)
         progress_bar.update(1)
 
 
 if __name__ == '__main__':
-    train_model(1, save_model_interval=1)
+    train_model(1, total_images=4, save_model_interval=1)
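
Stripped of the dataset plumbing, the new inner step is plain SGD on a noise-prediction MSE: noise the clean latents at random timesteps, ask the UNet for the noise back, and step both the optimizer and the cosine warmup schedule. A condensed sketch with toy tensors in place of the dataset (it reuses the smaller UNet config from test_pipeline.py to stay light; the names follow the diff):

import torch
from diffusers import DDPMScheduler, UNet2DConditionModel
from diffusers.optimization import get_cosine_schedule_with_warmup

unet = UNet2DConditionModel(sample_size=32, in_channels=16, out_channels=16,
    down_block_types=('CrossAttnDownBlock2D', 'CrossAttnDownBlock2D', 'DownBlock2D'),
    up_block_types=('UpBlock2D', 'CrossAttnUpBlock2D', 'CrossAttnUpBlock2D'),
    cross_attention_dim=768, block_out_channels=(64, 128, 256), norm_num_groups=32)
scheduler = DDPMScheduler(num_train_timesteps=20)
optimizer = torch.optim.SGD(unet.parameters(), lr=1e-3)
lr_scheduler = get_cosine_schedule_with_warmup(optimizer=optimizer,
    num_warmup_steps=1, num_training_steps=100)
loss_fn = torch.nn.MSELoss()

clean_images = torch.randn(1, 16, 32, 32)    # one 4-view latent stack (4 * 4 channels)
embeddings = torch.randn(1, 77, 768)         # stand-in for the CLIP embeddings

# one training step: noise the latents, predict the noise, regress with MSE
noise = torch.randn_like(clean_images)
timesteps = torch.randint(0, scheduler.config.num_train_timesteps, (1,))
noisy_images = scheduler.add_noise(clean_images, noise, timesteps)

optimizer.zero_grad()
noise_pred = unet(noisy_images, timesteps, encoder_hidden_states=embeddings).sample
loss = loss_fn(noise_pred, noise)
loss.backward()
optimizer.step()
lr_scheduler.step()

With the fused embedding path, cross_attention_dim=768 in train_model matches the (batch, 77, 768) tensors that test_generate_embeddings returns.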