Xsmos committed on
Commit
18afafd
·
verified ·
1 Parent(s): 13fe8cf

0816-110239

Browse files
Files changed (2) hide show
  1. diffusion.py +4 -4
  2. phoenix_diffusion.sbatch +3 -3
diffusion.py CHANGED
@@ -269,9 +269,9 @@ class TrainConfig:
269
  # dim = 2
270
  dim = 3#2
271
  stride = (2,4) if dim == 2 else (2,2,2)
272
- num_image = 300#0#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
273
- batch_size = 1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
274
- n_epoch = 50#1#50#10#1#50#1#50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
275
  HII_DIM = 64
276
  num_redshift = 64#256#512#256#512#256#512#256#512#64#512#64#512#64#256CUDAoom#128#64#512#128#64#512#256#256#64#512#128
277
  channel = 1
@@ -743,7 +743,7 @@ if __name__ == "__main__":
743
  parser.add_argument("--resume", type=str, required=False, help="filename of the model to resume", default=False)
744
  parser.add_argument("--num_new_img_per_gpu", type=int, required=False, default=4)
745
  parser.add_argument("--max_num_img_per_gpu", type=int, required=False, default=2)
746
- parser.add_argument("--gradient_accumulation_steps", type=int, required=False, default=1)
747
 
748
  args = parser.parse_args()
749
 
 
269
  # dim = 2
270
  dim = 3#2
271
  stride = (2,4) if dim == 2 else (2,2,2)
272
+ num_image = 3000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
273
+ batch_size = 2#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
274
+ n_epoch = 40#1#50#10#1#50#1#50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
275
  HII_DIM = 64
276
  num_redshift = 64#256#512#256#512#256#512#256#512#64#512#64#512#64#256CUDAoom#128#64#512#128#64#512#256#256#64#512#128
277
  channel = 1
 
743
  parser.add_argument("--resume", type=str, required=False, help="filename of the model to resume", default=False)
744
  parser.add_argument("--num_new_img_per_gpu", type=int, required=False, default=4)
745
  parser.add_argument("--max_num_img_per_gpu", type=int, required=False, default=2)
746
+ parser.add_argument("--gradient_accumulation_steps", type=int, required=False, default=1) # in testing, higher values led to slower training and a higher final loss
747
 
748
  args = parser.parse_args()
749
 
phoenix_diffusion.sbatch CHANGED
@@ -2,10 +2,10 @@
2
  #SBATCH -J diffusion # Job name
3
  #SBATCH -A gts-jw254-coda20
4
  #SBATCH -qembers
5
- #SBATCH -N1 --gpus-per-node=V100:1 -C V100-16GB # Number of nodes and cores per node required
6
  #SBATCH --ntasks-per-node=1
7
  #SBATCH --mem-per-gpu=16G # Memory per core
8
- #SBATCH -t 01:00:00 # Duration of the job (Ex: 15 mins)
9
  #SBATCH -oReport-%j # Combined output and error messages file
10
  #SBATCH --error=error-%j
11
  #SBATCH --mail-type=BEGIN,END,FAIL # Mail preferences
@@ -34,6 +34,6 @@ srun python diffusion.py \
34
  --resume outputs/model_state-N3000-device_count1-node2-epoch49-172.27.149.67 \
35
  --num_new_img_per_gpu 50 \
36
  --max_num_img_per_gpu 5 \
37
- --gradient_accumulation_steps 100 \
38
  ######################################################################################
39
 
 
2
  #SBATCH -J diffusion # Job name
3
  #SBATCH -A gts-jw254-coda20
4
  #SBATCH -qembers
5
+ #SBATCH -N4 --gpus-per-node=V100:1 -C V100-16GB # Number of nodes and GPUs per node required
6
  #SBATCH --ntasks-per-node=1
7
  #SBATCH --mem-per-gpu=16G # Memory per GPU
8
+ #SBATCH -t 08:00:00 # Duration of the job (Ex: 15 mins)
9
  #SBATCH -oReport-%j # Combined output and error messages file
10
  #SBATCH --error=error-%j
11
  #SBATCH --mail-type=BEGIN,END,FAIL # Mail preferences
 
34
  --resume outputs/model_state-N3000-device_count1-node2-epoch49-172.27.149.67 \
35
  --num_new_img_per_gpu 50 \
36
  --max_num_img_per_gpu 5 \
37
+ --gradient_accumulation_steps 1 \
38
  ######################################################################################
39