#!/bin/bash
# SLURM batch script: multi-GPU diffusion-model training on Perlmutter (NERSC).
# Submits one node with 4 GPUs, sets up torch.distributed rendezvous env vars,
# and resumes training from a saved checkpoint.
#SBATCH -A m4717
#SBATCH -J diffusion
#SBATCH -C gpu&hbm80g
#SBATCH -q regular
# (previously used: -q shared)
#SBATCH -N 1
#SBATCH --gpus-per-node=4
#SBATCH -t 3:00:00
#SBATCH --ntasks-per-node=1
#SBATCH -o Report-%j
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --gpu-bind=none

date

# Environment setup — the old stack used anaconda3 + a "diffusers" conda env.
#module load anaconda3/2022.05
#conda activate diffusers
module load pytorch   # was pinned to pytorch/2.0.1

# Diagnostics: record interpreter and module state in the job log.
which python
conda env list
module list

# torch.distributed rendezvous: master is the first node of the allocation,
# with a randomized port (10000-19999) to avoid collisions on shared nodes
# (fixed value 12355 was used before).
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR
export MASTER_PORT=$((10000 + RANDOM % 10000))
#export OMP_NUM_THREADS=1

# Reduce CUDA memory fragmentation via the expandable-segments allocator.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Echo this script into the job log for reproducibility.
cat -- "$0"

# NOTE: no trailing backslash on the last argument — the original ended the
# continuation on a commented-out line, which would have turned the following
# 'date' into a program argument if that comment were ever removed.
srun python diffusion.py \
  --num_image 1600 \
  --batch_size 2 \
  --n_epoch 20 \
  --channel_mult 0.5 1 2 4 4 8 \
  --num_new_img_per_gpu 4 \
  --max_num_img_per_gpu 2 \
  --gradient_accumulation_steps 10 \
  --autocast 1 \
  --use_checkpoint 1 \
  --dropout 0.2 \
  --lrate 2e-5 \
  --resume ./outputs/model-N1600-device_count4-node4-epoch14-32353762
  # To train from a dataset instead, re-add (with a '\' after --resume):
  #   --train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5"

date