Xsmos committed
Commit 83278af · verified · 1 Parent(s): 44f2584

0811-170519
context_unet.py CHANGED
@@ -533,7 +533,7 @@ class ContextUnet(nn.Module):
         text_outputs = self.token_embedding(y.to(self.dtype))
         emb = emb + text_outputs.to(emb)

-        #print("forward, h = x.type(self.dtype), self.dtype =", self.dtype)
+        print("forward, h = x.type(self.dtype), self.dtype =", self.dtype)
         h = x.type(self.dtype)
         #print("0,h.shape =", h.shape)
         for module in self.input_blocks:
@@ -551,7 +551,7 @@ class ContextUnet(nn.Module):
             h = module(h, emb)
             # print("module decoder, h.shape =", h.shape)

-        #print("h = h.type(x.dtype), x.dtype =", x.dtype)
+        print("h = h.type(x.dtype), x.dtype =", x.dtype)
         h = h.type(x.dtype)
         h = self.out(h)
         #print("self.out(h)", "h.shape =", h.shape)
diffusion.py CHANGED
@@ -208,9 +208,9 @@ class DDPMScheduler(nn.Module):
             # x_i = 1/torch.sqrt(self.alpha_t[i])*(x_i-eps*self.beta_t[i]/torch.sqrt(1-self.bar_alpha_t[i])) + torch.sqrt(self.beta_t[i])*z

             # print("x_i.shape =", x_i.shape)
-            #print(f"before, x_i.dtype = {x_i.dtype}, beta_t.dtype = {self.beta_t.dtype}, eps.dtype = {eps.dtype}, alpha_t.dtype = {self.alpha_t.dtype}, z.dtype = {z.dtype}")
+            print(f"before, x_i.dtype = {x_i.dtype}, beta_t.dtype = {self.beta_t.dtype}, eps.dtype = {eps.dtype}, alpha_t.dtype = {self.alpha_t.dtype}, z.dtype = {z.dtype}")
             x_i = 1/torch.sqrt(self.alpha_t[i])*(x_i-eps*self.beta_t[i]/torch.sqrt(1-self.bar_alpha_t[i])) + torch.sqrt(self.beta_t[i])*z
-            #print(f"after, x_i.dtype = {x_i.dtype}, beta_t.dtype = {self.beta_t.dtype}, eps.dtype = {eps.dtype}, alpha_t.dtype = {self.alpha_t.dtype}, z.dtype = {z.dtype}")
+            print(f"after, x_i.dtype = {x_i.dtype}, beta_t.dtype = {self.beta_t.dtype}, eps.dtype = {eps.dtype}, alpha_t.dtype = {self.alpha_t.dtype}, z.dtype = {z.dtype}")

             pbar_sample.update(1)
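For reference, the update these prints bracket is the standard DDPM ancestral-sampling step (Ho et al. 2020), with eps = \epsilon_\theta(x_i, i) the predicted noise and z \sim \mathcal{N}(0, I):

    x_{i-1} = \frac{1}{\sqrt{\alpha_i}} \left( x_i - \frac{\beta_i}{\sqrt{1 - \bar{\alpha}_i}} \, \epsilon_\theta(x_i, i) \right) + \sqrt{\beta_i} \, z

which is exactly the x_i = 1/torch.sqrt(self.alpha_t[i])*(...) line above. Since the prints sit inside the per-timestep loop, they will fire once per step; presumably temporary instrumentation for the dtype hunt.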
 
@@ -268,7 +268,7 @@ class TrainConfig:
     # dim = 2
     dim = 3#2
     stride = (2,4) if dim == 2 else (2,2,2)
-    num_image = 3000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
+    num_image = 30#00#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
     batch_size = 5#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
     n_epoch = 50#1#50#10#1#50#1#50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
     HII_DIM = 64
@@ -313,8 +313,8 @@ class TrainConfig:
     # params = params
     # data_dir = './data' # data directory

-    use_fp16 = True
-    dtype = torch.float16 if use_fp16 else torch.float32
+    #use_fp16 = True
+    dtype = torch.float32 #if use_fp16 else torch.float32
     mixed_precision = "fp16"
     gradient_accumulation_steps = 1

@@ -522,9 +522,9 @@ class DDPM21CM:
         # print("x = x.to(self.config.device), x.dtype =", x.dtype)
         # x = x.to(self.config.dtype)
         # print("x = x.to(self.dtype), x.dtype =", x.dtype)
-        #print(f"ddpm.add_noise(x), x.dtype = {x.dtype}")
+        print(f"ddpm.add_noise(x), x.dtype = {x.dtype}")
         xt, noise, ts = self.ddpm.add_noise(x)
-        #print(f"ddpm.add_noise(x), xt.dtype = {xt.dtype}")
+        print(f"ddpm.add_noise(x), xt.dtype = {xt.dtype}")
         if self.config.guide_w == -1:
             noise_pred = self.nn_model(xt, ts)
         else:
@@ -644,6 +644,20 @@ class DDPM21CM:
         # nn_model = ContextUnet(n_param=1, image_size=28)
         # nn_model.train()
         # self.nn_model.to(self.ddpm.device)
+
+        self.accelerator = Accelerator(
+            mixed_precision=self.config.mixed_precision,
+            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
+            log_with="tensorboard",
+            project_dir=os.path.join(self.config.output_dir, "logs"),
+            # distributed_type="MULTI_GPU",
+        )
+
+        self.nn_model, self.optimizer, self.lr_scheduler = \
+            self.accelerator.prepare(
+                self.nn_model, self.optimizer, self.lr_scheduler
+            )
+
         self.nn_model.eval()

         # self.ema_model = ContextUnet(n_param=config.n_param, image_size=config.HII_DIM, dim=config.dim, stride=config.stride).to(config.device)
@@ -751,7 +765,7 @@ if __name__ == "__main__":
     max_num_img_per_gpu = args.max_num_img_per_gpu#40#2#20
     config = TrainConfig()
     #config.world_size = world_size
-
+    # config.dtype = torch.float32
     config.resume = args.resume
     # config.resume = f"./outputs/model_state-N30-device_count3-epoch4-172.27.149.181"
     # config.resume = f"./outputs/model_state-N{config.num_image}-device_count{world_size}-epoch{config.n_epoch-1}"
 
phoenix_diffusion.sbatch CHANGED
@@ -5,7 +5,7 @@
 #SBATCH -N1 --gpus-per-node=V100:1 -C V100-32GB # Number of nodes and cores per node required
 #SBATCH --ntasks-per-node=1
 #SBATCH --mem-per-gpu=16G # Memory per core
-#SBATCH -t 08:00:00 # Duration of the job (Ex: 15 mins)
+#SBATCH -t 00:30:00 # Duration of the job (Ex: 15 mins)
 #SBATCH -oReport-%j # Combined output and error messages file
 #SBATCH --error=error-%j
 #SBATCH --mail-type=BEGIN,END,FAIL # Mail preferences
@@ -30,9 +30,9 @@ export MASTER_PORT=$MASTER_PORT

 srun python diffusion.py \
     --train 1 \
-    --resume outputs/model_state-N480-device_count1-node4-epoch49-172.27.149.66 \
+    --resume outputs/model_state-N3000-device_count1-node2-epoch49-172.27.149.67 \
     --num_new_img_per_gpu 50 \
-    --max_num_img_per_gpu 2 \
+    --max_num_img_per_gpu 5 \

 ######################################################################################

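Net effect of the batch-script hunks: walltime drops from 08:00:00 to 00:30:00, the resume checkpoint switches to the N3000 run, and --max_num_img_per_gpu goes from 2 to 5. These flags match the args.* reads visible in the diffusion.py hunks; for reference, a minimal sketch of a matching argparse parser (only the flag names come from this commit; the types, defaults, and help strings are guesses):

import argparse

parser = argparse.ArgumentParser(description="DDPM training/sampling driver (illustrative parser)")
parser.add_argument("--train", type=int, default=0, help="1 to train, 0 to sample")
parser.add_argument("--resume", type=str, default="", help="checkpoint path, e.g. outputs/model_state-...")
parser.add_argument("--num_new_img_per_gpu", type=int, default=50)
parser.add_argument("--max_num_img_per_gpu", type=int, default=5)
args = parser.parse_args()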
quantify_results.ipynb CHANGED
The diff for this file is too large to render. See raw diff