Xsmos committed
Commit f43e10e (verified)
Parent(s): 74c3051
Files changed (2)
  1. diffusion.py +5 -5
  2. perlmutter_diffusion.sbatch +1 -1
diffusion.py CHANGED
@@ -504,9 +504,9 @@ class DDPM21CM:
 
 
         # print("!!!!!!!!!!!!!!!!, before prepare, self.dataloader.sampler =", self.dataloader.sampler)
+        print_start = time()
         print(f"model: {self.nn_model.device}, cuda:{torch.cuda.current_device()}/{self.config.global_rank}")
         #print(f"optimizer: {self.optimizer.state_dict()}")
-        print_start = time()
         print(f"dataloader: {next(iter(self.dataloader))[0].device}")
         print(f"lr_scheduler: {self.lr_scheduler.optimizer is self.optimizer}")
         print_end = time()
@@ -527,7 +527,7 @@ class DDPM21CM:
         # print("!!!!!!!!!!!!!!!!, after prepare, self.dataloader.sampler =", self.dataloader.sampler)
         # print("!!!!!!!!!!!!!!!!, after prepare, self.dataloader.batch_sampler =", self.dataloader.batch_sampler)
         # print("!!!!!!!!!!!!!!!!, after prepare, self.dataloader.DistributedSampler =", self.dataloader.DistributedSampler)
-        train_start = time()
+        #train_start = time()
         global_step = 0
         for ep in range(self.config.n_epoch):
             self.ddpm.train()
@@ -537,9 +537,9 @@ class DDPM21CM:
             #train_end = time()
             #print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} ddpm.train costs {train_end-train_start:.3f}s")
             for i, (x, c) in enumerate(self.dataloader):
-                if i == 0:
-                    train_end = time()
-                    print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} ddpm.train costs {train_end-train_start:.3f}s")
+                #if i == 0:
+                #    train_end = time()
+                #    print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} ddpm.train costs {train_end-train_start:.3f}s")
 
                 # print(f"cuda:{torch.cuda.current_device()}, x[:,0,:2,0,0] =", x[:,0,:2,0,0])
                 with self.accelerator.accumulate(self.nn_model):
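For context, the timer being commented out measured the cost of the first dataloader iteration after ddpm.train() (worker start-up plus the first batch fetch). A minimal standalone sketch of that pattern, assuming an ordinary PyTorch DataLoader; time_first_batch is a hypothetical name, not part of this repository:

from time import time

def time_first_batch(dataloader):
    # Hypothetical helper (not in this repo): time the first iteration
    # of a dataloader, which is dominated by worker start-up and the
    # first batch fetch -- the same quantity the commented-out
    # train_start/train_end pair reported per rank.
    start = time()
    batch = next(iter(dataloader))
    print(f"first batch arrived after {time() - start:.3f}s")
    return batch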
perlmutter_diffusion.sbatch CHANGED
@@ -18,7 +18,7 @@ conda activate diffusers
 conda env list
 module list
 which python
-srun python -c "import torch; print(torch.cuda.device_count(), torch.__path__, torch.version.cuda); import accelerate; print(accelerate.__version__, accelerate.__path__))"
+srun python -c "import torch; print(torch.cuda.device_count(), torch.__path__, torch.version.cuda); import accelerate; print(accelerate.__version__, accelerate.__path__)"
 cat $0
 
 MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
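The only change here drops a stray closing parenthesis that made the inline Python a SyntaxError, so the srun sanity check crashed before printing anything. The fixed one-liner, expanded for readability (same statements):

import torch
# Report visible GPUs, the torch install path, and the CUDA build version.
print(torch.cuda.device_count(), torch.__path__, torch.version.cuda)
import accelerate
# Report the accelerate version and install path.
print(accelerate.__version__, accelerate.__path__)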