#!/bin/bash
#SBATCH -A m4717
#SBATCH -J diffusion
#SBATCH -C gpu&hbm80g
#SBATCH -q regular
##SBATCH -q shared
#SBATCH -N 1
#SBATCH --gpus-per-node=4
#SBATCH -t 3:00:00
#SBATCH --ntasks-per-node=1
#SBATCH -o Report-%j
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --gpu-bind=none

date
#module load anaconda3/2022.05 # Load module dependencies
module load pytorch # or pin a version, e.g. pytorch/2.0.1
#conda activate diffusers 
# Record which interpreter, conda env, and modules are active, for reproducibility
which python
conda env list
module list

# Rendezvous host = first node of the allocation; a random high port avoids
# collisions with other jobs (was hard-coded to 12355).
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
MASTER_PORT=$((10000 + RANDOM % 10000))
#export OMP_NUM_THREADS=1
export MASTER_ADDR=$MASTER_ADDR
export MASTER_PORT=$MASTER_PORT
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
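# MASTER_ADDR/MASTER_PORT follow the standard torch.distributed env:// rendezvous
# convention; diffusion.py is assumed (not confirmed here) to read them when it
# initializes its process group. expandable_segments:True lets the PyTorch CUDA
# caching allocator grow existing segments instead of fragmenting memory.
echo "Distributed rendezvous at ${MASTER_ADDR}:${MASTER_PORT}"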
cat "$0" # keep a copy of the submitted script in the job output

srun python diffusion.py \
    --num_image 1600 \
    --batch_size 2 \
    --n_epoch 20 \
    --channel_mult 0.5 1 2 4 4 8 \
    --num_new_img_per_gpu 4 \
    --max_num_img_per_gpu 2 \
    --gradient_accumulation_steps 10 \
    --autocast 1 \
    --use_checkpoint 1 \
    --dropout 0.2 \
    --lrate 2e-5 \
    --resume ./outputs/model-N1600-device_count4-node4-epoch14-32353762
    # The trailing backslash above was dropped so the commented option below cannot
    # accidentally join the command; to re-enable --train, restore it and uncomment:
    #--train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5"
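# Sanity check, assuming diffusion.py follows the usual convention of stepping the
# optimizer every gradient_accumulation_steps micro-batches: the effective global
# batch size would be
#   batch_size * gradient_accumulation_steps * GPUs = 2 * 10 * 4 = 80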

date
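
# Typical workflow (standard Slurm commands, nothing specific to this script):
#   sbatch <this_script>.sh    # submit the job
#   squeue -u $USER            # watch its queue state
#   tail -f Report-<jobid>     # follow the stdout file set by "-o Report-%j" above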