05140800
- diffusion.py +9 -4
- perlmutter_diffusion.sbatch +7 -3

diffusion.py CHANGED
@@ -513,11 +513,16 @@ class DDPM21CM:
         #print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} lr_scheduler: {self.lr_scheduler.optimizer is self.optimizer}", f"{time()-lr_start:.3f}s")
         #print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} print costs {print_end-print_start:.3f}s")
 
+        print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank}")
         acc_prep_start = time()
-        self.nn_model, self.optimizer, self.dataloader, self.lr_scheduler = \
-            self.accelerator.prepare(
-                self.nn_model, self.optimizer, self.dataloader, self.lr_scheduler
-            )
+        #self.nn_model, self.optimizer, self.dataloader, self.lr_scheduler = \
+        #    self.accelerator.prepare(
+        #        self.nn_model, self.optimizer, self.dataloader, self.lr_scheduler
+        #    )
+        self.nn_model = self.accelerator.prepare(self.nn_model)
+        self.optimizer = self.accelerator.prepare(self.optimizer)
+        self.dataloader = self.accelerator.prepare(self.dataloader)
+        self.lr_scheduler = self.accelerator.prepare(self.lr_scheduler)
         acc_prep_end = time()
         print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} accelerate.prepare cost {acc_prep_end-acc_prep_start:.3f}s")
         # self.nn_model, self.optimizer, self.lr_scheduler = \

perlmutter_diffusion.sbatch CHANGED
@@ -3,7 +3,7 @@
 #SBATCH -J diffusion
 #SBATCH -C gpu
 #SBATCH -q debug
-#SBATCH -
+#SBATCH -N1
 #SBATCH --gpus-per-node=2
 #SBATCH -t 0:05:00
 #SBATCH --ntasks-per-node=1
@@ -14,11 +14,11 @@
 date
 #module load anaconda3/2022.05 # Load module dependencies
 module load pytorch #/2.0.1
-#conda activate
+#conda activate diffusers
 conda env list
 module list
 which python
-srun python -c "import torch; print(torch.cuda.device_count(), torch.__path__, torch.version.cuda); import accelerate; print(accelerate.__version__, accelerate.__path__)"
+srun python -c "import torch; print('device_count', torch.cuda.device_count(), 'torch.__path__', torch.__path__, 'cuda version', torch.version.cuda); import accelerate; print('accelerate.__version', accelerate.__version__, 'accelerate.__path__', accelerate.__path__)"
 cat $0
 
 MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
@@ -29,6 +29,10 @@ export MASTER_PORT=$MASTER_PORT
 #export SLURM_CPU_BIND="cores"
 echo $MASTER_ADDR
 echo $MASTER_PORT
+nc -zv $MASTER_ADDR $MASTER_PORT
+
+export NCCL_DEBUG=INFO
+export NCCL_DEBUG_SUBSYS=ALL
 
 srun python diffusion.py \
     --train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \