Xsmos committed on
Commit
f5d552a
·
verified ·
1 Parent(s): 9ed06b8
Files changed (1) hide show
  1. perlmutter_diffusion.sbatch +9 -8
perlmutter_diffusion.sbatch CHANGED
@@ -5,7 +5,7 @@
5
  #SBATCH -q shared #regular
6
  #SBATCH -N1
7
  #SBATCH --gpus-per-node=1
8
- #SBATCH -t 0:30:00
9
  #SBATCH --ntasks-per-node=1
10
  #SBATCH -oReport-%j
11
  #SBATCH --mail-type=BEGIN,END,FAIL
@@ -15,11 +15,10 @@ date
15
  #module load anaconda3/2022.05 # Load module dependencies
16
  module load pytorch #/2.0.1
17
  #conda activate diffusers
 
18
  conda env list
19
  module list
20
- which python
21
- srun python -c "import torch; print('device_count', torch.cuda.device_count(), 'torch.__path__', torch.__path__, 'cuda version', torch.version.cuda); import accelerate; print('accelerate.__version', accelerate.__version__, 'accelerate.__path__', accelerate.__path__)"
22
- cat $0
23
 
24
  MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
25
  MASTER_PORT=$((10000 + RANDOM % 10000)) #12355
@@ -27,19 +26,21 @@ MASTER_PORT=$((10000 + RANDOM % 10000)) #12355
27
  export MASTER_ADDR=$MASTER_ADDR
28
  export MASTER_PORT=$MASTER_PORT
29
  #export SLURM_CPU_BIND="cores"
30
- echo $MASTER_ADDR
31
- echo $MASTER_PORT
32
  #nc -zv $MASTER_ADDR $MASTER_PORT
33
 
34
  #export NCCL_DEBUG=INFO
35
  #export NCCL_DEBUG_SUBSYS=ALL
 
36
 
37
  srun python diffusion.py \
38
  --train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
39
  --num_image 3200 \
40
- --batch_size 32
41
  --gradient_accumulation_steps 1 \
42
  --num_new_img_per_gpu 50 \
43
  --max_num_img_per_gpu 2 \
44
  #--resume outputs/model-N2000-device_count1-node8-epoch19-19004529 \
45
-
 
 
5
  #SBATCH -q shared #regular
6
  #SBATCH -N1
7
  #SBATCH --gpus-per-node=1
8
+ #SBATCH -t 0:10:00
9
  #SBATCH --ntasks-per-node=1
10
  #SBATCH -oReport-%j
11
  #SBATCH --mail-type=BEGIN,END,FAIL
 
15
  #module load anaconda3/2022.05 # Load module dependencies
16
  module load pytorch #/2.0.1
17
  #conda activate diffusers
18
+ which python
19
  conda env list
20
  module list
21
+ #srun python -c "import torch; print('device_count', torch.cuda.device_count(), 'torch.__path__', torch.__path__, 'cuda version', torch.version.cuda); import accelerate; print('accelerate.__version', accelerate.__version__, 'accelerate.__path__', accelerate.__path__)"
 
 
22
 
23
  MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
24
  MASTER_PORT=$((10000 + RANDOM % 10000)) #12355
 
26
  export MASTER_ADDR=$MASTER_ADDR
27
  export MASTER_PORT=$MASTER_PORT
28
  #export SLURM_CPU_BIND="cores"
29
+ #echo $MASTER_ADDR
30
+ #echo $MASTER_PORT
31
  #nc -zv $MASTER_ADDR $MASTER_PORT
32
 
33
  #export NCCL_DEBUG=INFO
34
  #export NCCL_DEBUG_SUBSYS=ALL
35
+ cat $0
36
 
37
  srun python diffusion.py \
38
  --train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
39
  --num_image 3200 \
40
+ --batch_size 32 \
41
  --gradient_accumulation_steps 1 \
42
  --num_new_img_per_gpu 50 \
43
  --max_num_img_per_gpu 2 \
44
  #--resume outputs/model-N2000-device_count1-node8-epoch19-19004529 \
45
+
46
+ date