32353789
Browse files- diffusion.py +1 -1
- perlmutter_diffusion.sbatch +3 -3
diffusion.py
CHANGED
|
@@ -278,7 +278,7 @@ class TrainConfig:
|
|
| 278 |
# seed = 0
|
| 279 |
# save_dir = './outputs/'
|
| 280 |
|
| 281 |
-
save_period =
|
| 282 |
# general parameters for the name and logger
|
| 283 |
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 284 |
lrate = 1e-4
|
|
|
|
| 278 |
# seed = 0
|
| 279 |
# save_dir = './outputs/'
|
| 280 |
|
| 281 |
+
save_period = 10 #np.infty #n_epoch // 2 #np.infty#.1 # the period of sampling
|
| 282 |
# general parameters for the name and logger
|
| 283 |
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 284 |
lrate = 1e-4
|
perlmutter_diffusion.sbatch
CHANGED
|
@@ -5,7 +5,7 @@
|
|
| 5 |
#SBATCH -q regular #shared
|
| 6 |
#SBATCH -N1
|
| 7 |
#SBATCH --gpus-per-node=4
|
| 8 |
-
#SBATCH -t
|
| 9 |
#SBATCH --ntasks-per-node=1
|
| 10 |
#SBATCH -oReport-%j
|
| 11 |
#SBATCH --mail-type=BEGIN,END,FAIL
|
|
@@ -39,7 +39,7 @@ srun python diffusion.py \
|
|
| 39 |
--use_checkpoint 1 \
|
| 40 |
--dropout 0.2 \
|
| 41 |
--lrate 2e-5 \
|
| 42 |
-
--
|
| 43 |
-
#--
|
| 44 |
|
| 45 |
date
|
|
|
|
| 5 |
#SBATCH -q regular #shared
|
| 6 |
#SBATCH -N1
|
| 7 |
#SBATCH --gpus-per-node=4
|
| 8 |
+
#SBATCH -t 3:00:00
|
| 9 |
#SBATCH --ntasks-per-node=1
|
| 10 |
#SBATCH -oReport-%j
|
| 11 |
#SBATCH --mail-type=BEGIN,END,FAIL
|
|
|
|
| 39 |
--use_checkpoint 1 \
|
| 40 |
--dropout 0.2 \
|
| 41 |
--lrate 2e-5 \
|
| 42 |
+
--resume ./outputs/model-N1600-device_count4-node4-epoch14-32353762 \
|
| 43 |
+
#--train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
|
| 44 |
|
| 45 |
date
|