Xsmos committed on
Commit
722bab4
·
verified ·
1 Parent(s): 19a0366
diffusion.py CHANGED
@@ -363,7 +363,7 @@ def get_gpu_info(device):
363
 
364
  class DDPM21CM:
365
  def __init__(self, config):
366
- config.run_name = datetime.now().strftime("%d%H%M%S") # the unique name of each experiment
367
  self.config = config
368
  self.ddpm = DDPMScheduler(betas=(1e-4, 0.02), num_timesteps=config.num_timesteps, img_shape=config.img_shape, device=config.device, config=config,)#, dtype=config.dtype
369
 
@@ -381,7 +381,7 @@ class DDPM21CM:
381
  # print(f"resumed nn_model from {config.resume}")
382
  self.nn_model.module.load_state_dict(torch.load(config.resume)['unet_state_dict'])
383
  #self.nn_model.module.to(config.dtype)
384
- print(f"{config.run_name} cuda:{torch.cuda.current_device()}/{self.config.global_rank} resumed nn_model from {config.resume} with {sum(x.numel() for x in self.nn_model.parameters())} parameters".center(self.config.str_len,'+'))
385
  else:
386
  print(f"{config.run_name} cuda:{torch.cuda.current_device()}/{self.config.global_rank} initialized nn_model randomly with {sum(x.numel() for x in self.nn_model.parameters())} parameters, {datetime.now().strftime('%d-%H:%M:%S.%f')}".center(self.config.str_len,'+'))
387
 
@@ -713,6 +713,7 @@ if __name__ == "__main__":
713
  parser.add_argument("--autocast", type=int, required=False, default=False)
714
  parser.add_argument("--use_checkpoint", type=int, required=False, default=False)
715
  parser.add_argument("--dropout", type=float, required=False, default=0)
 
716
 
717
  args = parser.parse_args()
718
 
@@ -731,6 +732,7 @@ if __name__ == "__main__":
731
  config.autocast = bool(args.autocast)
732
  config.use_checkpoint = bool(args.use_checkpoint)
733
  config.dropout = args.dropout
 
734
 
735
  ############################ training ################################
736
  if args.train:
 
363
 
364
  class DDPM21CM:
365
  def __init__(self, config):
366
+ config.run_name = os.environ.get("SLURM_JOB_ID", datetime.now().strftime("%d%H%M%S")) # the unique name of each experiment
367
  self.config = config
368
  self.ddpm = DDPMScheduler(betas=(1e-4, 0.02), num_timesteps=config.num_timesteps, img_shape=config.img_shape, device=config.device, config=config,)#, dtype=config.dtype
369
 
 
381
  # print(f"resumed nn_model from {config.resume}")
382
  self.nn_model.module.load_state_dict(torch.load(config.resume)['unet_state_dict'])
383
  #self.nn_model.module.to(config.dtype)
384
+ print(f"{config.run_name} cuda:{torch.cuda.current_device()}/{self.config.global_rank} resumed nn_model from {config.resume} with {sum(x.numel() for x in self.nn_model.parameters())} parameters, {datetime.now().strftime('%d-%H:%M:%S.%f')}".center(self.config.str_len,'+'))
385
  else:
386
  print(f"{config.run_name} cuda:{torch.cuda.current_device()}/{self.config.global_rank} initialized nn_model randomly with {sum(x.numel() for x in self.nn_model.parameters())} parameters, {datetime.now().strftime('%d-%H:%M:%S.%f')}".center(self.config.str_len,'+'))
387
 
 
713
  parser.add_argument("--autocast", type=int, required=False, default=False)
714
  parser.add_argument("--use_checkpoint", type=int, required=False, default=False)
715
  parser.add_argument("--dropout", type=float, required=False, default=0)
716
+ parser.add_argument("--lrate", type=float, required=False, default=1e-4)
717
 
718
  args = parser.parse_args()
719
 
 
732
  config.autocast = bool(args.autocast)
733
  config.use_checkpoint = bool(args.use_checkpoint)
734
  config.dropout = args.dropout
735
+ config.lrate = args.lrate
736
 
737
  ############################ training ################################
738
  if args.train:
perlmutter_diffusion.sbatch CHANGED
@@ -35,10 +35,11 @@ srun python diffusion.py \
35
  --num_new_img_per_gpu 4 \
36
  --max_num_img_per_gpu 2 \
37
  --gradient_accumulation_steps 1 \
38
- --autocast 0 \
39
  --use_checkpoint 1 \
40
  --dropout 0.1 \
41
- --resume ./outputs/model-N1600-device_count4-node4-epoch29-20051216 \
 
42
  #--train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
43
 
44
  date
 
35
  --num_new_img_per_gpu 4 \
36
  --max_num_img_per_gpu 2 \
37
  --gradient_accumulation_steps 1 \
38
+ --autocast 1 \
39
  --use_checkpoint 1 \
40
  --dropout 0.1 \
41
+ --lrate 7e-5 \
42
+ --resume ./outputs/model-N1600-device_count4-node4-epoch34-20051216 \
43
  #--train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
44
 
45
  date
quantify_results.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a0c13ff1f79531d58b80a67cf6ae9141685d85483eac6006d7fc90f33d55283e
3
- size 28539850
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8144cb4596999bacefce85cad17af66ea5ebfd5e11d2517361e365fce9895b45
3
+ size 15754478