04144242
- diffusion.py +5 -5
- perlmutter_diffusion.sbatch +1 -1
diffusion.py
CHANGED
@@ -504,9 +504,9 @@ class DDPM21CM:
 
 
         # print("!!!!!!!!!!!!!!!!, before prepare, self.dataloader.sampler =", self.dataloader.sampler)
+        print_start = time()
         print(f"model: {self.nn_model.device}, cuda:{torch.cuda.current_device()}/{self.config.global_rank}")
         #print(f"optimizer: {self.optimizer.state_dict()}")
-        print_start = time()
         print(f"dataloader: {next(iter(self.dataloader))[0].device}")
         print(f"lr_scheduler: {self.lr_scheduler.optimizer is self.optimizer}")
         print_end = time()
@@ -527,7 +527,7 @@ class DDPM21CM:
         # print("!!!!!!!!!!!!!!!!, after prepare, self.dataloader.sampler =", self.dataloader.sampler)
         # print("!!!!!!!!!!!!!!!!, after prepare, self.dataloader.batch_sampler =", self.dataloader.batch_sampler)
         # print("!!!!!!!!!!!!!!!!, after prepare, self.dataloader.DistributedSampler =", self.dataloader.DistributedSampler)
-        train_start = time()
+        #train_start = time()
         global_step = 0
         for ep in range(self.config.n_epoch):
            self.ddpm.train()
@@ -537,9 +537,9 @@ class DDPM21CM:
            #train_end = time()
            #print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} ddpm.train costs {train_end-train_start:.3f}s")
            for i, (x, c) in enumerate(self.dataloader):
-               if i == 0:
-                   train_end = time()
-                   print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} ddpm.train costs {train_end-train_start:.3f}s")
+               #if i == 0:
+               #    train_end = time()
+               #    print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} ddpm.train costs {train_end-train_start:.3f}s")
 
                # print(f"cuda:{torch.cuda.current_device()}, x[:,0,:2,0,0] =", x[:,0,:2,0,0])
                with self.accelerator.accumulate(self.nn_model):
perlmutter_diffusion.sbatch
CHANGED
@@ -18,7 +18,7 @@ conda activate diffusers
 conda env list
 module list
 which python
-srun python -c "import torch; print(torch.cuda.device_count(), torch.__path__, torch.version.cuda); import accelerate; print(accelerate.__version__, accelerate.__path__)
+srun python -c "import torch; print(torch.cuda.device_count(), torch.__path__, torch.version.cuda); import accelerate; print(accelerate.__version__, accelerate.__path__)"
 cat $0
 
 MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)