Commit 06224158 — Browse files
Changed files:
- diffusion.py (+3 −1)
- perlmutter_diffusion.sbatch (+5 −4)
- quantify_results.ipynb (notebook; diff not rendered, 0 lines shown)
diffusion.py
CHANGED
|
@@ -272,7 +272,7 @@ class TrainConfig:
|
|
| 272 |
stride = (2,2) if dim == 2 else (2,2,2)
|
| 273 |
num_image = 32#0#0#640#320#6400#3000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
|
| 274 |
batch_size = 1#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
|
| 275 |
- n_epoch = …  [removed-line value truncated during page extraction; the added line later in this diff shows the new assignment in full]
|
| 276 |
HII_DIM = 64
|
| 277 |
num_redshift = 64#256#512#256#512#256#512#256#512#64#512#64#512#64#256CUDAoom#128#64#512#128#64#512#256#256#64#512#128
|
| 278 |
startat = 512-num_redshift
|
|
@@ -802,6 +802,7 @@ if __name__ == "__main__":
|
|
| 802 |
parser.add_argument("--max_num_img_per_gpu", type=int, required=False, default=2)
|
| 803 |
parser.add_argument("--gradient_accumulation_steps", type=int, required=False, default=1) # as tested, higher value leads to slower training and higher loss in the end
|
| 804 |
parser.add_argument("--num_image", type=int, required=False, default=32)
|
|
|
|
| 805 |
parser.add_argument("--batch_size", type=int, required=False, default=2)
|
| 806 |
|
| 807 |
args = parser.parse_args()
|
|
@@ -815,6 +816,7 @@ if __name__ == "__main__":
|
|
| 815 |
config = TrainConfig()
|
| 816 |
config.gradient_accumulation_steps = args.gradient_accumulation_steps
|
| 817 |
config.num_image = args.num_image
|
|
|
|
| 818 |
config.batch_size = args.batch_size
|
| 819 |
############################ training ################################
|
| 820 |
if args.train:
|
|
|
|
| 272 |
stride = (2,2) if dim == 2 else (2,2,2)
|
| 273 |
num_image = 32#0#0#640#320#6400#3000#480#1200#120#3000#300#3000#6000#30#60#6000#1000#2000#20000#15000#7000#25600#3000#10000#1000#10000#5000#2560#800#2560
|
| 274 |
batch_size = 1#1#10#50#10#50#20#50#1#2#50#20#2#100 # 10
|
| 275 |
+
n_epoch = 100#30#50#20#1#50#10#1#50#1#50#5#50#5#50#100#50#100#30#120#5#4# 10#50#20#20#2#5#25 # 120
|
| 276 |
HII_DIM = 64
|
| 277 |
num_redshift = 64#256#512#256#512#256#512#256#512#64#512#64#512#64#256CUDAoom#128#64#512#128#64#512#256#256#64#512#128
|
| 278 |
startat = 512-num_redshift
|
|
|
|
| 802 |
parser.add_argument("--max_num_img_per_gpu", type=int, required=False, default=2)
|
| 803 |
parser.add_argument("--gradient_accumulation_steps", type=int, required=False, default=1) # as tested, higher value leads to slower training and higher loss in the end
|
| 804 |
parser.add_argument("--num_image", type=int, required=False, default=32)
|
| 805 |
+
parser.add_argument("--n_epoch", type=int, required=False, default=50)
|
| 806 |
parser.add_argument("--batch_size", type=int, required=False, default=2)
|
| 807 |
|
| 808 |
args = parser.parse_args()
|
|
|
|
| 816 |
config = TrainConfig()
|
| 817 |
config.gradient_accumulation_steps = args.gradient_accumulation_steps
|
| 818 |
config.num_image = args.num_image
|
| 819 |
+
config.n_epoch = args.n_epoch
|
| 820 |
config.batch_size = args.batch_size
|
| 821 |
############################ training ################################
|
| 822 |
if args.train:
|
perlmutter_diffusion.sbatch
CHANGED
|
@@ -2,10 +2,10 @@
|
|
| 2 |
#SBATCH -A m4717
|
| 3 |
#SBATCH -J diffusion
|
| 4 |
#SBATCH -C gpu
|
| 5 |
-
#SBATCH -q shared
|
| 6 |
- #SBATCH -…  [removed flag truncated during page extraction; the replacement line in the new version is "#SBATCH -N4"]
|
| 7 |
#SBATCH --gpus-per-node=1
|
| 8 |
- #SBATCH -t 0:…  [old wall-time value truncated during page extraction; the new version sets "#SBATCH -t 0:50:00"]
|
| 9 |
#SBATCH --ntasks-per-node=1
|
| 10 |
#SBATCH -oReport-%j
|
| 11 |
#SBATCH --mail-type=BEGIN,END,FAIL
|
|
@@ -38,9 +38,10 @@ srun python diffusion.py \
|
|
| 38 |
--train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
|
| 39 |
--num_image 3200 \
|
| 40 |
--batch_size 32 \
|
|
|
|
| 41 |
--gradient_accumulation_steps 1 \
|
| 42 |
--num_new_img_per_gpu 320 \
|
| 43 |
--max_num_img_per_gpu 32 \
|
| 44 |
- #--resume outputs/model-N3200-device_count1-node1-…  [old checkpoint name truncated during page extraction; the new version's line ends "-epoch99-06161732 \"]
|
| 45 |
|
| 46 |
date
|
|
|
|
| 2 |
#SBATCH -A m4717
|
| 3 |
#SBATCH -J diffusion
|
| 4 |
#SBATCH -C gpu
|
| 5 |
+
#SBATCH -q regular #shared
|
| 6 |
+
#SBATCH -N4
|
| 7 |
#SBATCH --gpus-per-node=1
|
| 8 |
+
#SBATCH -t 0:50:00
|
| 9 |
#SBATCH --ntasks-per-node=1
|
| 10 |
#SBATCH -oReport-%j
|
| 11 |
#SBATCH --mail-type=BEGIN,END,FAIL
|
|
|
|
| 38 |
--train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
|
| 39 |
--num_image 3200 \
|
| 40 |
--batch_size 32 \
|
| 41 |
+
--n_epoch 100 \
|
| 42 |
--gradient_accumulation_steps 1 \
|
| 43 |
--num_new_img_per_gpu 320 \
|
| 44 |
--max_num_img_per_gpu 32 \
|
| 45 |
+
#--resume outputs/model-N3200-device_count1-node1-epoch99-06161732 \
|
| 46 |
|
| 47 |
date
|
quantify_results.ipynb
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|