Xsmos committed
Commit f43e10e (verified)
Parent(s): 74c3051
Files changed (2)
  1. diffusion.py +5 -5
  2. perlmutter_diffusion.sbatch +1 -1
diffusion.py CHANGED
@@ -504,9 +504,9 @@ class DDPM21CM:
 
 
         # print("!!!!!!!!!!!!!!!!, before prepare, self.dataloader.sampler =", self.dataloader.sampler)
+        print_start = time()
         print(f"model: {self.nn_model.device}, cuda:{torch.cuda.current_device()}/{self.config.global_rank}")
         #print(f"optimizer: {self.optimizer.state_dict()}")
-        print_start = time()
         print(f"dataloader: {next(iter(self.dataloader))[0].device}")
         print(f"lr_scheduler: {self.lr_scheduler.optimizer is self.optimizer}")
         print_end = time()
@@ -527,7 +527,7 @@ class DDPM21CM:
         # print("!!!!!!!!!!!!!!!!, after prepare, self.dataloader.sampler =", self.dataloader.sampler)
         # print("!!!!!!!!!!!!!!!!, after prepare, self.dataloader.batch_sampler =", self.dataloader.batch_sampler)
         # print("!!!!!!!!!!!!!!!!, after prepare, self.dataloader.DistributedSampler =", self.dataloader.DistributedSampler)
-        train_start = time()
+        #train_start = time()
         global_step = 0
         for ep in range(self.config.n_epoch):
             self.ddpm.train()
@@ -537,9 +537,9 @@ class DDPM21CM:
             #train_end = time()
             #print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} ddpm.train costs {train_end-train_start:.3f}s")
             for i, (x, c) in enumerate(self.dataloader):
-                if i == 0:
-                    train_end = time()
-                    print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} ddpm.train costs {train_end-train_start:.3f}s")
+                #if i == 0:
+                #    train_end = time()
+                #    print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} ddpm.train costs {train_end-train_start:.3f}s")
 
                 # print(f"cuda:{torch.cuda.current_device()}, x[:,0,:2,0,0] =", x[:,0,:2,0,0])
                 with self.accelerator.accumulate(self.nn_model):
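For context, the timer being commented out measured the cost of the first dataloader iteration after ddpm.train() (worker start-up plus the first batch fetch). A minimal standalone sketch of that pattern, assuming an ordinary PyTorch DataLoader; time_first_batch is a hypothetical name, not part of this repository:

from time import time

def time_first_batch(dataloader):
    # Hypothetical helper (not in this repo): time the first iteration
    # of a dataloader, which is dominated by worker start-up and the
    # first batch fetch -- the same quantity the commented-out
    # train_start/train_end pair reported per rank.
    start = time()
    batch = next(iter(dataloader))
    print(f"first batch arrived after {time() - start:.3f}s")
    return batch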
perlmutter_diffusion.sbatch CHANGED
@@ -18,7 +18,7 @@ conda activate diffusers
 conda env list
 module list
 which python
-srun python -c "import torch; print(torch.cuda.device_count(), torch.__path__, torch.version.cuda); import accelerate; print(accelerate.__version__, accelerate.__path__))"
+srun python -c "import torch; print(torch.cuda.device_count(), torch.__path__, torch.version.cuda); import accelerate; print(accelerate.__version__, accelerate.__path__)"
 cat $0
 
 MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
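The only change here drops a stray closing parenthesis that made the inline Python a SyntaxError, so the srun sanity check crashed before printing anything. The fixed one-liner, expanded for readability (same statements):

import torch
# Report visible GPUs, the torch install path, and the CUDA build version.
print(torch.cuda.device_count(), torch.__path__, torch.version.cuda)
import accelerate
# Report the accelerate version and install path.
print(accelerate.__version__, accelerate.__path__)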