Xsmos committed on
Commit
d0061ee
·
verified ·
1 Parent(s): 9a8d4b7
Files changed (2) hide show
  1. diffusion.py +9 -4
  2. perlmutter_diffusion.sbatch +7 -3
diffusion.py CHANGED
@@ -513,11 +513,16 @@ class DDPM21CM:
513
  #print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} lr_scheduler: {self.lr_scheduler.optimizer is self.optimizer}", f"{time()-lr_start:.3f}s")
514
  #print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} print costs {print_end-print_start:.3f}s")
515
 
 
516
  acc_prep_start = time()
517
- self.nn_model, self.optimizer, self.dataloader, self.lr_scheduler = \
518
- self.accelerator.prepare(
519
- self.nn_model, self.optimizer, self.dataloader, self.lr_scheduler
520
- )
 
 
 
 
521
  acc_prep_end = time()
522
  print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} accelerate.prepare cost {acc_prep_end-acc_prep_start:.3f}s")
523
  # self.nn_model, self.optimizer, self.lr_scheduler = \
 
513
  #print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} lr_scheduler: {self.lr_scheduler.optimizer is self.optimizer}", f"{time()-lr_start:.3f}s")
514
  #print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} print costs {print_end-print_start:.3f}s")
515
 
516
+ print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank}")
517
  acc_prep_start = time()
518
+ #self.nn_model, self.optimizer, self.dataloader, self.lr_scheduler = \
519
+ # self.accelerator.prepare(
520
+ # self.nn_model, self.optimizer, self.dataloader, self.lr_scheduler
521
+ # )
522
+ self.nn_model = self.accelerator.prepare(self.nn_model)
523
+ self.optimizer = self.accelerator.prepare(self.optimizer)
524
+ self.dataloader = self.accelerator.prepare(self.dataloader)
525
+ self.lr_scheduler = self.accelerator.prepare(self.lr_scheduler)
526
  acc_prep_end = time()
527
  print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} accelerate.prepare cost {acc_prep_end-acc_prep_start:.3f}s")
528
  # self.nn_model, self.optimizer, self.lr_scheduler = \
perlmutter_diffusion.sbatch CHANGED
@@ -3,7 +3,7 @@
3
  #SBATCH -J diffusion
4
  #SBATCH -C gpu
5
  #SBATCH -q debug
6
- #SBATCH -N2
7
  #SBATCH --gpus-per-node=2
8
  #SBATCH -t 0:05:00
9
  #SBATCH --ntasks-per-node=1
@@ -14,11 +14,11 @@
14
  date
15
  #module load anaconda3/2022.05 # Load module dependencies
16
  module load pytorch #/2.0.1
17
- #conda activate diffusion #diffusers
18
  conda env list
19
  module list
20
  which python
21
- srun python -c "import torch; print(torch.cuda.device_count(), torch.__path__, torch.version.cuda); import accelerate; print(accelerate.__version__, accelerate.__path__)"
22
  cat $0
23
 
24
  MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
@@ -29,6 +29,10 @@ export MASTER_PORT=$MASTER_PORT
29
  #export SLURM_CPU_BIND="cores"
30
  echo $MASTER_ADDR
31
  echo $MASTER_PORT
 
 
 
 
32
 
33
  srun python diffusion.py \
34
  --train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
 
3
  #SBATCH -J diffusion
4
  #SBATCH -C gpu
5
  #SBATCH -q debug
6
+ #SBATCH -N1
7
  #SBATCH --gpus-per-node=2
8
  #SBATCH -t 0:05:00
9
  #SBATCH --ntasks-per-node=1
 
14
  date
15
  #module load anaconda3/2022.05 # Load module dependencies
16
  module load pytorch #/2.0.1
17
+ #conda activate diffusers
18
  conda env list
19
  module list
20
  which python
21
+ srun python -c "import torch; print('device_count', torch.cuda.device_count(), 'torch.__path__', torch.__path__, 'cuda version', torch.version.cuda); import accelerate; print('accelerate.__version', accelerate.__version__, 'accelerate.__path__', accelerate.__path__)"
22
  cat $0
23
 
24
  MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
 
29
  #export SLURM_CPU_BIND="cores"
30
  echo $MASTER_ADDR
31
  echo $MASTER_PORT
32
+ nc -zv $MASTER_ADDR $MASTER_PORT
33
+
34
+ export NCCL_DEBUG=INFO
35
+ export NCCL_DEBUG_SUBSYS=ALL
36
 
37
  srun python diffusion.py \
38
  --train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \