Xsmos committed
Commit 83278af · verified · 1 Parent(s): 44f2584

0811-170519
context_unet.py CHANGED
@@ -533,7 +533,7 @@ class ContextUnet(nn.Module):
         text_outputs = self.token_embedding(y.to(self.dtype))
         emb = emb + text_outputs.to(emb)

-        #print("forward, h = x.type(self.dtype), self.dtype =", self.dtype)
+        print("forward, h = x.type(self.dtype), self.dtype =", self.dtype)
         h = x.type(self.dtype)
         #print("0,h.shape =", h.shape)
         for module in self.input_blocks:
@@ -551,7 +551,7 @@ class ContextUnet(nn.Module):
             h = module(h, emb)
             # print("module decoder, h.shape =", h.shape)

-        #print("h = h.type(x.dtype), x.dtype =", x.dtype)
+        print("h = h.type(x.dtype), x.dtype =", x.dtype)
         h = h.type(x.dtype)
         h = self.out(h)
         #print("self.out(h)", "h.shape =", h.shape)
diffusion.py CHANGED
@@ -208,9 +208,9 @@ class DDPMScheduler(nn.Module):
             # x_i = 1/torch.sqrt(self.alpha_t[i])*(x_i-eps*self.beta_t[i]/torch.sqrt(1-self.bar_alpha_t[i])) + torch.sqrt(self.beta_t[i])*z

             # print("x_i.shape =", x_i.shape)
-            #print(f"before, x_i.dtype = {x_i.dtype}, beta_t.dtype = {self.beta_t.dtype}, eps.dtype = {eps.dtype}, alpha_t.dtype = {self.alpha_t.dtype}, z.dtype = {z.dtype}")
+            print(f"before, x_i.dtype = {x_i.dtype}, beta_t.dtype = {self.beta_t.dtype}, eps.dtype = {eps.dtype}, alpha_t.dtype = {self.alpha_t.dtype}, z.dtype = {z.dtype}")
             x_i = 1/torch.sqrt(self.alpha_t[i])*(x_i-eps*self.beta_t[i]/torch.sqrt(1-self.bar_alpha_t[i])) + torch.sqrt(self.beta_t[i])*z
-            #print(f"after, x_i.dtype = {x_i.dtype}, beta_t.dtype = {self.beta_t.dtype}, eps.dtype = {eps.dtype}, alpha_t.dtype = {self.alpha_t.dtype}, z.dtype = {z.dtype}")
+            print(f"after, x_i.dtype = {x_i.dtype}, beta_t.dtype = {self.beta_t.dtype}, eps.dtype = {eps.dtype}, alpha_t.dtype = {self.alpha_t.dtype}, z.dtype = {z.dtype}")

             pbar_sample.update(1)
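For reference, the update these prints bracket is the standard DDPM ancestral-sampling step (Ho et al. 2020), with eps = \epsilon_\theta(x_i, i) the predicted noise and z \sim \mathcal{N}(0, I):

    x_{i-1} = \frac{1}{\sqrt{\alpha_i}} \left( x_i - \frac{\beta_i}{\sqrt{1 - \bar{\alpha}_i}} \, \epsilon_\theta(x_i, i) \right) + \sqrt{\beta_i} \, z

which is exactly the x_i = 1/torch.sqrt(self.alpha_t[i])*(...) line above. Since the prints sit inside the per-timestep loop, they will fire once per step; presumably temporary instrumentation for the dtype hunt.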
 
@@ -268,7 +268,7 @@ class TrainConfig:
     # dim = 2
     dim = 3#2
     stride = (2,4) if dim == 2 else (2,2,2)
-    num_image = 3000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
+    num_image = 30#00#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
     batch_size = 5#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
     n_epoch = 50#1#50#10#1#50#1#50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
     HII_DIM = 64
@@ -313,8 +313,8 @@ class TrainConfig:
     # params = params
     # data_dir = './data' # data directory

-    use_fp16 = True
-    dtype = torch.float16 if use_fp16 else torch.float32
+    #use_fp16 = True
+    dtype = torch.float32 #if use_fp16 else torch.float32
     mixed_precision = "fp16"
     gradient_accumulation_steps = 1

@@ -522,9 +522,9 @@ class DDPM21CM:
         # print("x = x.to(self.config.device), x.dtype =", x.dtype)
         # x = x.to(self.config.dtype)
         # print("x = x.to(self.dtype), x.dtype =", x.dtype)
-        #print(f"ddpm.add_noise(x), x.dtype = {x.dtype}")
+        print(f"ddpm.add_noise(x), x.dtype = {x.dtype}")
         xt, noise, ts = self.ddpm.add_noise(x)
-        #print(f"ddpm.add_noise(x), xt.dtype = {xt.dtype}")
+        print(f"ddpm.add_noise(x), xt.dtype = {xt.dtype}")
         if self.config.guide_w == -1:
             noise_pred = self.nn_model(xt, ts)
         else:
@@ -644,6 +644,20 @@ class DDPM21CM:
         # nn_model = ContextUnet(n_param=1, image_size=28)
         # nn_model.train()
         # self.nn_model.to(self.ddpm.device)
+
+        self.accelerator = Accelerator(
+            mixed_precision=self.config.mixed_precision,
+            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
+            log_with="tensorboard",
+            project_dir=os.path.join(self.config.output_dir, "logs"),
+            # distributed_type="MULTI_GPU",
+        )
+
+        self.nn_model, self.optimizer, self.lr_scheduler = \
+            self.accelerator.prepare(
+                self.nn_model, self.optimizer, self.lr_scheduler
+            )
+
         self.nn_model.eval()

         # self.ema_model = ContextUnet(n_param=config.n_param, image_size=config.HII_DIM, dim=config.dim, stride=config.stride).to(config.device)
@@ -751,7 +765,7 @@ if __name__ == "__main__":
     max_num_img_per_gpu = args.max_num_img_per_gpu#40#2#20
     config = TrainConfig()
     #config.world_size = world_size
-
+    # config.dtype = torch.float32
     config.resume = args.resume
     # config.resume = f"./outputs/model_state-N30-device_count3-epoch4-172.27.149.181"
     # config.resume = f"./outputs/model_state-N{config.num_image}-device_count{world_size}-epoch{config.n_epoch-1}"
 
phoenix_diffusion.sbatch CHANGED
@@ -5,7 +5,7 @@
 #SBATCH -N1 --gpus-per-node=V100:1 -C V100-32GB # Number of nodes and cores per node required
 #SBATCH --ntasks-per-node=1
 #SBATCH --mem-per-gpu=16G # Memory per core
-#SBATCH -t 08:00:00 # Duration of the job (Ex: 15 mins)
+#SBATCH -t 00:30:00 # Duration of the job (Ex: 15 mins)
 #SBATCH -oReport-%j # Combined output and error messages file
 #SBATCH --error=error-%j
 #SBATCH --mail-type=BEGIN,END,FAIL # Mail preferences
@@ -30,9 +30,9 @@ export MASTER_PORT=$MASTER_PORT

 srun python diffusion.py \
     --train 1 \
-    --resume outputs/model_state-N480-device_count1-node4-epoch49-172.27.149.66 \
+    --resume outputs/model_state-N3000-device_count1-node2-epoch49-172.27.149.67 \
     --num_new_img_per_gpu 50 \
-    --max_num_img_per_gpu 2 \
+    --max_num_img_per_gpu 5 \

 ######################################################################################

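Net effect of the batch-script hunks: walltime drops from 08:00:00 to 00:30:00, the resume checkpoint switches to the N3000 run, and --max_num_img_per_gpu goes from 2 to 5. These flags match the args.* reads visible in the diffusion.py hunks; for reference, a minimal sketch of a matching argparse parser (only the flag names come from this commit; the types, defaults, and help strings are guesses):

import argparse

parser = argparse.ArgumentParser(description="DDPM training/sampling driver (illustrative parser)")
parser.add_argument("--train", type=int, default=0, help="1 to train, 0 to sample")
parser.add_argument("--resume", type=str, default="", help="checkpoint path, e.g. outputs/model_state-...")
parser.add_argument("--num_new_img_per_gpu", type=int, default=50)
parser.add_argument("--max_num_img_per_gpu", type=int, default=5)
args = parser.parse_args()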
quantify_results.ipynb CHANGED
The diff for this file is too large to render. See raw diff