Xsmos committed on
Commit
fce0fb3
·
verified ·
1 Parent(s): 48b8ffd

0803-182144

Browse files
diffusion.py CHANGED
@@ -260,7 +260,7 @@ class TrainConfig:
260
  stride = (2,4) if dim == 2 else (2,2,2)
261
  num_image = 3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
262
  batch_size = 10#50#10#50#20#50#1#2#50#20#2#100 # 10
263
- n_epoch = 50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
264
  HII_DIM = 64
265
  num_redshift = 512#64#512#64#512#64#256CUDAoom#128#64#512#128#64#512#256#256#64#512#128
266
  channel = 1
@@ -711,7 +711,7 @@ def generate_samples(rank, world_size, local_world_size, master_addr, master_por
711
  if __name__ == "__main__":
712
  parser = argparse.ArgumentParser()
713
  parser.add_argument("--train", type=int, required=False, help="whether to train the model", default=1)
714
- parser.add_argument("--sample", type=int, required=False, help="whether to sample", default=0)
715
  parser.add_argument("--resume", type=str, required=False, help="filename of the model to resume", default=False)
716
  parser.add_argument("--num_new_img_per_gpu", type=int, required=False, default=4)
717
  parser.add_argument("--max_num_img_per_gpu", type=int, required=False, default=2)
@@ -725,7 +725,7 @@ if __name__ == "__main__":
725
  world_size = local_world_size * total_nodes #6#int(os.environ["SLURM_NTASKS"])
726
 
727
  ############################ training ################################
728
- if args.train:
729
  print(f" training, ip_addr = {socket.gethostbyname(socket.gethostname())}, master_addr = {master_addr}, local_world_size = {local_world_size}, world_size = {world_size} ".center(120,'-'))
730
  mp.spawn(
731
  train,
@@ -734,7 +734,7 @@ if __name__ == "__main__":
734
  join=True,
735
  )
736
  ############################ sampling ################################
737
- if args.sample:
738
  num_new_img_per_gpu = args.num_new_img_per_gpu#200#4#200
739
  max_num_img_per_gpu = args.max_num_img_per_gpu#40#2#20
740
  config = TrainConfig()
@@ -748,10 +748,10 @@ if __name__ == "__main__":
748
  # return_dict = manager.dict()
749
  params_pairs = [
750
  (4.4, 131.341),
751
- #(5.6, 19.037),
752
- #(4.699, 30),
753
- #(5.477, 200),
754
- #(4.8, 131.341),
755
  ]
756
 
757
  for params in params_pairs:
 
260
  stride = (2,4) if dim == 2 else (2,2,2)
261
  num_image = 3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
262
  batch_size = 10#50#10#50#20#50#1#2#50#20#2#100 # 10
263
+ n_epoch = 1#50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
264
  HII_DIM = 64
265
  num_redshift = 512#64#512#64#512#64#256CUDAoom#128#64#512#128#64#512#256#256#64#512#128
266
  channel = 1
 
711
  if __name__ == "__main__":
712
  parser = argparse.ArgumentParser()
713
  parser.add_argument("--train", type=int, required=False, help="whether to train the model", default=1)
714
+ #parser.add_argument("--sample", type=int, required=False, help="whether to sample", default=0)
715
  parser.add_argument("--resume", type=str, required=False, help="filename of the model to resume", default=False)
716
  parser.add_argument("--num_new_img_per_gpu", type=int, required=False, default=4)
717
  parser.add_argument("--max_num_img_per_gpu", type=int, required=False, default=2)
 
725
  world_size = local_world_size * total_nodes #6#int(os.environ["SLURM_NTASKS"])
726
 
727
  ############################ training ################################
728
+ if args.train == 1:
729
  print(f" training, ip_addr = {socket.gethostbyname(socket.gethostname())}, master_addr = {master_addr}, local_world_size = {local_world_size}, world_size = {world_size} ".center(120,'-'))
730
  mp.spawn(
731
  train,
 
734
  join=True,
735
  )
736
  ############################ sampling ################################
737
+ if args.train == 0:
738
  num_new_img_per_gpu = args.num_new_img_per_gpu#200#4#200
739
  max_num_img_per_gpu = args.max_num_img_per_gpu#40#2#20
740
  config = TrainConfig()
 
748
  # return_dict = manager.dict()
749
  params_pairs = [
750
  (4.4, 131.341),
751
+ (5.6, 19.037),
752
+ (4.699, 30),
753
+ (5.477, 200),
754
+ (4.8, 131.341),
755
  ]
756
 
757
  for params in params_pairs:
phoenix_diffusion.sbatch CHANGED
@@ -2,10 +2,10 @@
2
  #SBATCH -J diffusion # Job name
3
  #SBATCH -A gts-jw254-coda20
4
  #SBATCH -qembers
5
- #SBATCH -N1 --gpus-per-node=RTX_6000:2 # -C A100-80GB # Number of nodes and cores per node required
6
  #SBATCH --ntasks-per-node=1
7
- #SBATCH --mem-per-gpu=16G # Memory per core
8
- #SBATCH -t 03:00:00 # Duration of the job (Ex: 15 mins)
9
  #SBATCH -oReport-%j # Combined output and error messages file
10
  #SBATCH --error=error-%j
11
  #SBATCH --mail-type=BEGIN,END,FAIL # Mail preferences
@@ -30,10 +30,9 @@ export MASTER_PORT=$MASTER_PORT
30
 
31
  srun python diffusion.py \
32
  --train 1 \
33
- --sample 0 \
34
- --resume outputs/model_state-N3000-device_count3-node2-epoch49-172.27.145.67 \
35
- --num_new_img_per_gpu 200 \
36
- --max_num_img_per_gpu 40 \
37
 
38
  ######################################################################################
39
 
 
2
  #SBATCH -J diffusion # Job name
3
  #SBATCH -A gts-jw254-coda20
4
  #SBATCH -qembers
5
+ #SBATCH -N8 --gpus-per-node=RTX_6000:1 # -C A100-80GB # Number of nodes and cores per node required
6
  #SBATCH --ntasks-per-node=1
7
+ #SBATCH --mem-per-gpu=8G # Memory per core
8
+ #SBATCH -t 00:05:00 # Duration of the job (Ex: 15 mins)
9
  #SBATCH -oReport-%j # Combined output and error messages file
10
  #SBATCH --error=error-%j
11
  #SBATCH --mail-type=BEGIN,END,FAIL # Mail preferences
 
30
 
31
  srun python diffusion.py \
32
  --train 1 \
33
+ --resume outputs/model_state-N3000-device_count1-node4-epoch49-172.27.149.191 \
34
+ --num_new_img_per_gpu 50 \
35
+ --max_num_img_per_gpu 10 \
 
36
 
37
  ######################################################################################
38
 
quantify_results.ipynb CHANGED
The diff for this file is too large to render. See raw diff