Xsmos committed
Commit 86215c4 · verified · 1 Parent(s): a11de78
diffusion.py CHANGED
@@ -429,10 +429,6 @@ class DDPM21CM:
         self.ema_model = copy.deepcopy(self.nn_model).eval().requires_grad_(False)
 
         self.optimizer = torch.optim.AdamW(self.nn_model.parameters(), lr=config.lrate)
-        #self.lr_scheduler = get_cosine_schedule_with_warmup(
-        #    optimizer=self.optimizer,
-        #    num_training_steps=int(config.num_image / config.batch_size * config.n_epoch / config.gradient_accumulation_steps),
-        #)
         self.lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
             optimizer = self.optimizer,
             T_max = int(config.num_image / config.batch_size * config.n_epoch / config.gradient_accumulation_steps),
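This hunk removes the commented-out get_cosine_schedule_with_warmup block and keeps only CosineAnnealingLR, with T_max set to the total number of optimizer updates rather than micro-batches. A minimal sketch of the same setup, with illustrative values (the 3200/4/100/10 numbers and the toy Linear model are assumptions, not the repo's actual config):

    import torch

    # Illustrative numbers only; the repo's real values come from its config / CLI args.
    num_image, batch_size, n_epoch, gradient_accumulation_steps = 3200, 4, 100, 10

    model = torch.nn.Linear(8, 8)                    # stand-in for self.nn_model
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

    # The scheduler is stepped once per optimizer update (see the next hunk),
    # so T_max is the number of updates, not the number of micro-batches.
    total_updates = int(num_image / batch_size * n_epoch / gradient_accumulation_steps)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_updates)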
@@ -569,16 +565,12 @@ class DDPM21CM:
         else:
             c = c.to(self.config.device)
         noise_pred = self.nn_model(xt, ts, c).to(x.dtype)
-
-        # print("noise_pred = self.nn_model(xt, ts, c), noise_pred.dtype =", noise_pred.dtype, noise.dtype)
 
         loss = F.mse_loss(noise, noise_pred)
         loss = loss / self.config.gradient_accumulation_steps
         loss.backward()
-        #print(f"loss.dtype =", loss.dtype)
-        #self.accelerator.backward(loss)
-        #self.accelerator.clip_grad_norm_(self.nn_model.parameters(), 1)
-        if (i+i) % self.config.gradient_accumulation_steps == 0:
+
+        if (i+1) % self.config.gradient_accumulation_steps == 0:
             torch.nn.utils.clip_grad_norm_(self.nn_model.parameters(), max_norm=1.0)
             self.optimizer.step()
             self.lr_scheduler.step()
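The substantive fix in this hunk is the accumulation condition: with gradient_accumulation_steps = 10, the old (i+i) % 10 == 0 test fires at i = 0 and then every 5th micro-batch, stepping the optimizer too early and twice as often as intended, whereas (i+1) % 10 == 0 steps it exactly once per 10 accumulated micro-batches. A minimal sketch of the corrected loop under that reading (toy model and random data; only the accumulation/step logic mirrors the diff, and the zero_grad call is assumed to live elsewhere in the real training loop):

    import torch
    import torch.nn.functional as F

    gradient_accumulation_steps = 10
    model = torch.nn.Linear(16, 16)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)

    for i in range(1000):                                  # i indexes micro-batches from 0
        x = torch.randn(4, 16)
        noise = torch.randn(4, 16)
        noise_pred = model(x)
        loss = F.mse_loss(noise, noise_pred)
        loss = loss / gradient_accumulation_steps          # scale so accumulated grads average correctly
        loss.backward()                                    # grads accumulate until the optimizer steps
        if (i + 1) % gradient_accumulation_steps == 0:     # fires on micro-batches 9, 19, 29, ...
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()                          # not shown in the hunk; assumed elsewhere in the loop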
perlmutter_diffusion.sbatch CHANGED
@@ -5,7 +5,7 @@
 #SBATCH -q shared #regular
 #SBATCH -N1
 #SBATCH --gpus-per-node=1
-#SBATCH -t 0:50:00
+#SBATCH -t 0:30:00
 #SBATCH --ntasks-per-node=1
 #SBATCH -oReport-%j
 #SBATCH --mail-type=BEGIN,END,FAIL
@@ -42,6 +42,6 @@ srun python diffusion.py \
     --gradient_accumulation_steps 10 \
     --num_new_img_per_gpu 800 \
     --max_num_img_per_gpu 80 \
-    #--resume outputs/model-N3200-device_count4-node4-epoch99-07140940 \
+    #--resume outputs/model-N3200-device_count1-node1-epoch99-07213338 \
 
 date
quantify_results.ipynb CHANGED
The diff for this file is too large to render. See raw diff