Xsmos committed on
Commit
7f7e11f
·
verified ·
1 Parent(s): 7dfd906
diffusion.py CHANGED
@@ -272,7 +272,7 @@ class TrainConfig:
272
  stride = (2,2) if dim == 2 else (2,2,2)
273
  num_image = 32#0#0#640#320#6400#3000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
274
  batch_size = 1#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
275
- n_epoch = 200#30#50#20#1#50#10#1#50#1#50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
276
  HII_DIM = 64
277
  num_redshift = 64#256#512#256#512#256#512#256#512#64#512#64#512#64#256CUDAoom#128#64#512#128#64#512#256#256#64#512#128
278
  startat = 512-num_redshift
@@ -802,6 +802,7 @@ if __name__ == "__main__":
802
  parser.add_argument("--max_num_img_per_gpu", type=int, required=False, default=2)
803
  parser.add_argument("--gradient_accumulation_steps", type=int, required=False, default=1) # as tested, higher value leads to slower training and higher loss in the end
804
  parser.add_argument("--num_image", type=int, required=False, default=32)
 
805
  parser.add_argument("--batch_size", type=int, required=False, default=2)
806
 
807
  args = parser.parse_args()
@@ -815,6 +816,7 @@ if __name__ == "__main__":
815
  config = TrainConfig()
816
  config.gradient_accumulation_steps = args.gradient_accumulation_steps
817
  config.num_image = args.num_image
 
818
  config.batch_size = args.batch_size
819
  ############################ training ################################
820
  if args.train:
 
272
  stride = (2,2) if dim == 2 else (2,2,2)
273
  num_image = 32#0#0#640#320#6400#3000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
274
  batch_size = 1#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
275
+ n_epoch = 100#30#50#20#1#50#10#1#50#1#50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
276
  HII_DIM = 64
277
  num_redshift = 64#256#512#256#512#256#512#256#512#64#512#64#512#64#256CUDAoom#128#64#512#128#64#512#256#256#64#512#128
278
  startat = 512-num_redshift
 
802
  parser.add_argument("--max_num_img_per_gpu", type=int, required=False, default=2)
803
  parser.add_argument("--gradient_accumulation_steps", type=int, required=False, default=1) # as tested, higher value leads to slower training and higher loss in the end
804
  parser.add_argument("--num_image", type=int, required=False, default=32)
805
+ parser.add_argument("--n_epoch", type=int, required=False, default=50)
806
  parser.add_argument("--batch_size", type=int, required=False, default=2)
807
 
808
  args = parser.parse_args()
 
816
  config = TrainConfig()
817
  config.gradient_accumulation_steps = args.gradient_accumulation_steps
818
  config.num_image = args.num_image
819
+ config.n_epoch = args.n_epoch
820
  config.batch_size = args.batch_size
821
  ############################ training ################################
822
  if args.train:
perlmutter_diffusion.sbatch CHANGED
@@ -2,10 +2,10 @@
2
  #SBATCH -A m4717
3
  #SBATCH -J diffusion
4
  #SBATCH -C gpu
5
- #SBATCH -q shared #regular
6
- #SBATCH -N1
7
  #SBATCH --gpus-per-node=1
8
- #SBATCH -t 0:59:00
9
  #SBATCH --ntasks-per-node=1
10
  #SBATCH -oReport-%j
11
  #SBATCH --mail-type=BEGIN,END,FAIL
@@ -38,9 +38,10 @@ srun python diffusion.py \
38
  --train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
39
  --num_image 3200 \
40
  --batch_size 32 \
 
41
  --gradient_accumulation_steps 1 \
42
  --num_new_img_per_gpu 320 \
43
  --max_num_img_per_gpu 32 \
44
- #--resume outputs/model-N3200-device_count1-node1-epoch29-06121554 \
45
 
46
  date
 
2
  #SBATCH -A m4717
3
  #SBATCH -J diffusion
4
  #SBATCH -C gpu
5
+ #SBATCH -q regular #shared
6
+ #SBATCH -N4
7
  #SBATCH --gpus-per-node=1
8
+ #SBATCH -t 0:50:00
9
  #SBATCH --ntasks-per-node=1
10
  #SBATCH -oReport-%j
11
  #SBATCH --mail-type=BEGIN,END,FAIL
 
38
  --train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
39
  --num_image 3200 \
40
  --batch_size 32 \
41
+ --n_epoch 100 \
42
  --gradient_accumulation_steps 1 \
43
  --num_new_img_per_gpu 320 \
44
  --max_num_img_per_gpu 32 \
45
+ #--resume outputs/model-N3200-device_count1-node1-epoch99-06161732 \
46
 
47
  date
quantify_results.ipynb CHANGED
The diff for this file is too large to render. See raw diff