06161732
Browse files
- diffusion.py +14 -14
- load_h5.py +1 -1
- perlmutter_diffusion.sbatch +4 -4
- quantify_results.ipynb +0 -0
diffusion.py
CHANGED
|
@@ -272,7 +272,7 @@ class TrainConfig:
|
|
| 272 |
stride = (2,2) if dim == 2 else (2,2,2)
|
| 273 |
num_image = 32#0#0#640#320#6400#3000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
|
| 274 |
batch_size = 1#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
|
| 275 |
-
n_epoch = 30#50#20#1#50#10#1#50#1#50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
|
| 276 |
HII_DIM = 64
|
| 277 |
num_redshift = 64#256#512#256#512#256#512#256#512#64#512#64#512#64#256CUDAoom#128#64#512#128#64#512#256#256#64#512#128
|
| 278 |
startat = 512-num_redshift
|
|
@@ -516,12 +516,12 @@ class DDPM21CM:
|
|
| 516 |
#print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} lr_scheduler: {self.lr_scheduler.optimizer is self.optimizer}", f"{time()-lr_start:.3f}s")
|
| 517 |
#print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} print costs {print_end-print_start:.3f}s")
|
| 518 |
if torch.distributed.is_initialized():
|
| 519 |
-
print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} torch.distributed.is_initialized")
|
| 520 |
torch.distributed.barrier()
|
| 521 |
else:
|
| 522 |
print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} torch.distributed.is_initialized False!!!!!!!!!!!!!!!")
|
| 523 |
|
| 524 |
-
print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank}; nn_model.device = {self.nn_model.device}")
|
| 525 |
#acc_prep_start = time()
|
| 526 |
#self.nn_model, self.optimizer, self.dataloader, self.lr_scheduler = \
|
| 527 |
# self.accelerator.prepare(
|
|
@@ -702,18 +702,18 @@ class DDPM21CM:
|
|
| 702 |
# nn_model.train()
|
| 703 |
# self.nn_model.to(self.ddpm.device)
|
| 704 |
|
| 705 |
-
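self.accelerator = Accelerator(
|
| 706 |
-
mixed_precision=self.config.mixed_precision,
|
| 707 |
-
gradient_accumulation_steps=self.config.gradient_accumulation_steps,
|
| 708 |
-
log_with="tensorboard",
|
| 709 |
-
project_dir=os.path.join(self.config.output_dir, "logs"),
|
| 710 |
-
# distributed_type="MULTI_GPU",
|
| 711 |
-
)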
|
| 712 |
|
| 713 |
-
self.nn_model, self.optimizer, self.lr_scheduler = \
|
| 714 |
-
self.accelerator.prepare(
|
| 715 |
-
self.nn_model, self.optimizer, self.lr_scheduler
|
| 716 |
-
)
|
| 717 |
|
| 718 |
self.nn_model.eval()
|
| 719 |
|
|
|
|
| 272 |
stride = (2,2) if dim == 2 else (2,2,2)
|
| 273 |
num_image = 32#0#0#640#320#6400#3000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
|
| 274 |
batch_size = 1#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
|
| 275 |
+
n_epoch = 200#30#50#20#1#50#10#1#50#1#50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
|
| 276 |
HII_DIM = 64
|
| 277 |
num_redshift = 64#256#512#256#512#256#512#256#512#64#512#64#512#64#256CUDAoom#128#64#512#128#64#512#256#256#64#512#128
|
| 278 |
startat = 512-num_redshift
|
|
|
|
| 516 |
#print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} lr_scheduler: {self.lr_scheduler.optimizer is self.optimizer}", f"{time()-lr_start:.3f}s")
|
| 517 |
#print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} print costs {print_end-print_start:.3f}s")
|
| 518 |
if torch.distributed.is_initialized():
|
| 519 |
+
#print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} torch.distributed.is_initialized")
|
| 520 |
torch.distributed.barrier()
|
| 521 |
else:
|
| 522 |
print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} torch.distributed.is_initialized False!!!!!!!!!!!!!!!")
|
| 523 |
|
| 524 |
+
#print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank}; nn_model.device = {self.nn_model.device}")
|
| 525 |
#acc_prep_start = time()
|
| 526 |
#self.nn_model, self.optimizer, self.dataloader, self.lr_scheduler = \
|
| 527 |
# self.accelerator.prepare(
|
|
|
|
| 702 |
# nn_model.train()
|
| 703 |
# self.nn_model.to(self.ddpm.device)
|
| 704 |
|
| 705 |
+
#self.accelerator = Accelerator(
|
| 706 |
+
# mixed_precision=self.config.mixed_precision,
|
| 707 |
+
# gradient_accumulation_steps=self.config.gradient_accumulation_steps,
|
| 708 |
+
# log_with="tensorboard",
|
| 709 |
+
# project_dir=os.path.join(self.config.output_dir, "logs"),
|
| 710 |
+
# # distributed_type="MULTI_GPU",
|
| 711 |
+
#)
|
| 712 |
|
| 713 |
+
#self.nn_model, self.optimizer, self.lr_scheduler = \
|
| 714 |
+
# self.accelerator.prepare(
|
| 715 |
+
# self.nn_model, self.optimizer, self.lr_scheduler
|
| 716 |
+
# )
|
| 717 |
|
| 718 |
self.nn_model.eval()
|
| 719 |
|
load_h5.py
CHANGED
|
@@ -43,7 +43,7 @@ class Dataset4h5(Dataset):
|
|
| 43 |
dim=2,
|
| 44 |
transform=True,
|
| 45 |
ranges_dict=None,
|
| 46 |
-
num_workers=len(os.sched_getaffinity(0))//torch.cuda.device_count(),
|
| 47 |
startat=0,
|
| 48 |
# shuffle=False,
|
| 49 |
):
|
|
|
|
| 43 |
dim=2,
|
| 44 |
transform=True,
|
| 45 |
ranges_dict=None,
|
| 46 |
+
num_workers=1,#len(os.sched_getaffinity(0))//torch.cuda.device_count(),
|
| 47 |
startat=0,
|
| 48 |
# shuffle=False,
|
| 49 |
):
|
perlmutter_diffusion.sbatch
CHANGED
|
@@ -5,7 +5,7 @@
|
|
| 5 |
#SBATCH -q shared #regular
|
| 6 |
#SBATCH -N1
|
| 7 |
#SBATCH --gpus-per-node=1
|
| 8 |
-
#SBATCH -t 0:
|
| 9 |
#SBATCH --ntasks-per-node=1
|
| 10 |
#SBATCH -oReport-%j
|
| 11 |
#SBATCH --mail-type=BEGIN,END,FAIL
|
|
@@ -39,8 +39,8 @@ srun python diffusion.py \
|
|
| 39 |
--num_image 3200 \
|
| 40 |
--batch_size 32 \
|
| 41 |
--gradient_accumulation_steps 1 \
|
| 42 |
-
--num_new_img_per_gpu
|
| 43 |
-
--max_num_img_per_gpu
|
| 44 |
-
#--resume outputs/model-
|
| 45 |
|
| 46 |
date
|
|
|
|
| 5 |
#SBATCH -q shared #regular
|
| 6 |
#SBATCH -N1
|
| 7 |
#SBATCH --gpus-per-node=1
|
| 8 |
+
#SBATCH -t 0:59:00
|
| 9 |
#SBATCH --ntasks-per-node=1
|
| 10 |
#SBATCH -oReport-%j
|
| 11 |
#SBATCH --mail-type=BEGIN,END,FAIL
|
|
|
|
| 39 |
--num_image 3200 \
|
| 40 |
--batch_size 32 \
|
| 41 |
--gradient_accumulation_steps 1 \
|
| 42 |
+
--num_new_img_per_gpu 320 \
|
| 43 |
+
--max_num_img_per_gpu 32 \
|
| 44 |
+
#--resume outputs/model-N3200-device_count1-node1-epoch29-06121554 \
|
| 45 |
|
| 46 |
date
|
quantify_results.ipynb
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|