Xsmos committed
Commit 2a4802c · verified · 1 Parent(s): 6077f4f
diffusion.py CHANGED
@@ -272,7 +272,7 @@ class TrainConfig:
     stride = (2,2) if dim == 2 else (2,2,2)
     num_image = 32#0#0#640#320#6400#3000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
     batch_size = 1#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
-    n_epoch = 30#50#20#1#50#10#1#50#1#50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
+    n_epoch = 200#30#50#20#1#50#10#1#50#1#50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
     HII_DIM = 64
     num_redshift = 64#256#512#256#512#256#512#256#512#64#512#64#512#64#256CUDAoom#128#64#512#128#64#512#256#256#64#512#128
     startat = 512-num_redshift
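The n_epoch bump from 30 to 200 is easiest to read in terms of optimizer steps implied by the surrounding config values. A minimal sketch, assuming one optimizer step per batch and gradient_accumulation_steps=1 (the value passed in the sbatch script below):

```python
import math

# Values from TrainConfig in this commit.
num_image = 32
batch_size = 1
n_epoch = 200                    # was 30 before this commit
gradient_accumulation_steps = 1  # assumption: matches the sbatch flag

steps_per_epoch = math.ceil(num_image / batch_size)  # 32
optimizer_steps = n_epoch * steps_per_epoch // gradient_accumulation_steps
print(optimizer_steps)  # 6400 steps, up from 960 at n_epoch = 30
```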
@@ -516,12 +516,12 @@ class DDPM21CM:
         #print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} lr_scheduler: {self.lr_scheduler.optimizer is self.optimizer}", f"{time()-lr_start:.3f}s")
         #print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} print costs {print_end-print_start:.3f}s")
         if torch.distributed.is_initialized():
-            print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} torch.distributed.is_initialized")
+            #print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} torch.distributed.is_initialized")
             torch.distributed.barrier()
         else:
             print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank} torch.distributed.is_initialized False!!!!!!!!!!!!!!!")

-        print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank}; nn_model.device = {self.nn_model.device}")
+        #print(f"cuda:{torch.cuda.current_device()}/{self.config.global_rank}; nn_model.device = {self.nn_model.device}")
         #acc_prep_start = time()
         #self.nn_model, self.optimizer, self.dataloader, self.lr_scheduler = \
         #    self.accelerator.prepare(
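The hunk above keeps the barrier but silences the per-rank print. For reference, the guard-then-barrier idiom it relies on, as a standalone sketch (hypothetical helper name; torch.distributed.barrier() raises if the default process group was never initialized, hence the check):

```python
import torch.distributed as dist

def sync_ranks() -> None:
    """Block until all ranks reach this point; no-op when running single-process."""
    if dist.is_available() and dist.is_initialized():
        dist.barrier()  # collective: every rank must call this, or it hangs
```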
@@ -702,18 +702,18 @@ class DDPM21CM:
         # nn_model.train()
         # self.nn_model.to(self.ddpm.device)

-        self.accelerator = Accelerator(
-            mixed_precision=self.config.mixed_precision,
-            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
-            log_with="tensorboard",
-            project_dir=os.path.join(self.config.output_dir, "logs"),
-            # distributed_type="MULTI_GPU",
-        )
+        #self.accelerator = Accelerator(
+        #    mixed_precision=self.config.mixed_precision,
+        #    gradient_accumulation_steps=self.config.gradient_accumulation_steps,
+        #    log_with="tensorboard",
+        #    project_dir=os.path.join(self.config.output_dir, "logs"),
+        #    # distributed_type="MULTI_GPU",
+        #)

-        self.nn_model, self.optimizer, self.lr_scheduler = \
-            self.accelerator.prepare(
-                self.nn_model, self.optimizer, self.lr_scheduler
-            )
+        #self.nn_model, self.optimizer, self.lr_scheduler = \
+        #    self.accelerator.prepare(
+        #        self.nn_model, self.optimizer, self.lr_scheduler
+        #    )

         self.nn_model.eval()
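With the Accelerator construction and prepare() call commented out, this path reduces to plain eval-mode inference. A minimal sketch of the equivalent setup, assuming nn_model is an ordinary torch.nn.Module with weights already loaded (noise_batch and the forward signature are hypothetical):

```python
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
nn_model = nn_model.to(device)
nn_model.eval()                    # disable dropout, freeze batch-norm statistics
with torch.no_grad():              # sampling needs no autograd graph
    out = nn_model(noise_batch)    # hypothetical forward call
```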
 
load_h5.py CHANGED
@@ -43,7 +43,7 @@ class Dataset4h5(Dataset):
         dim=2,
         transform=True,
         ranges_dict=None,
-        num_workers=len(os.sched_getaffinity(0))//torch.cuda.device_count(),
+        num_workers=1,#len(os.sched_getaffinity(0))//torch.cuda.device_count(),
         startat=0,
         # shuffle=False,
     ):
 
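The old default evaluated len(os.sched_getaffinity(0))//torch.cuda.device_count() once, at function-definition (import) time, and divides by zero on a CPU-only node; pinning the default to 1 sidesteps both. If the per-GPU heuristic is still wanted, a guarded and lazily evaluated sketch (hypothetical helper name):

```python
import os
import torch

def default_num_workers() -> int:
    """CPU cores available to this process, split across visible GPUs."""
    cores = len(os.sched_getaffinity(0))      # cores this process may use (Linux)
    gpus = max(torch.cuda.device_count(), 1)  # avoid ZeroDivisionError without GPUs
    return max(cores // gpus, 1)
```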
perlmutter_diffusion.sbatch CHANGED
@@ -5,7 +5,7 @@
 #SBATCH -q shared #regular
 #SBATCH -N1
 #SBATCH --gpus-per-node=1
-#SBATCH -t 0:10:00
+#SBATCH -t 0:59:00
 #SBATCH --ntasks-per-node=1
 #SBATCH -oReport-%j
 #SBATCH --mail-type=BEGIN,END,FAIL
@@ -39,8 +39,8 @@ srun python diffusion.py \
     --num_image 3200 \
     --batch_size 32 \
     --gradient_accumulation_steps 1 \
-    --num_new_img_per_gpu 50 \
-    --max_num_img_per_gpu 2 \
-    #--resume outputs/model-N2000-device_count1-node8-epoch19-19004529 \
+    --num_new_img_per_gpu 320 \
+    --max_num_img_per_gpu 32 \
+    #--resume outputs/model-N3200-device_count1-node1-epoch29-06121554 \

 date
 
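The new flags scale the sampling workload: 320 images per GPU, generated in chunks of at most 32. A back-of-the-envelope check of the pass count that has to fit inside the new 0:59:00 walltime (flag semantics assumed from their names, not from the source):

```python
import math

num_new_img_per_gpu = 320  # total images each GPU must generate
max_num_img_per_gpu = 32   # largest chunk that fits in memory per pass
passes = math.ceil(num_new_img_per_gpu / max_num_img_per_gpu)
print(passes)  # 10 sequential sampling passes per GPU
```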
quantify_results.ipynb CHANGED
The diff for this file is too large to render. See raw diff