06120628
Browse files
perlmutter_diffusion.sbatch
CHANGED
|
@@ -5,7 +5,7 @@
|
|
| 5 |
#SBATCH -q shared #regular
|
| 6 |
#SBATCH -N1
|
| 7 |
#SBATCH --gpus-per-node=1
|
| 8 |
-
#SBATCH -t 0:
|
| 9 |
#SBATCH --ntasks-per-node=1
|
| 10 |
#SBATCH -oReport-%j
|
| 11 |
#SBATCH --mail-type=BEGIN,END,FAIL
|
|
@@ -15,11 +15,10 @@ date
|
|
| 15 |
#module load anaconda3/2022.05 # Load module dependencies
|
| 16 |
module load pytorch #/2.0.1
|
| 17 |
#conda activate diffusers
|
|
|
|
| 18 |
conda env list
|
| 19 |
module list
|
| 20 |
-
|
| 21 |
-
srun python -c "import torch; print('device_count', torch.cuda.device_count(), 'torch.__path__', torch.__path__, 'cuda version', torch.version.cuda); import accelerate; print('accelerate.__version', accelerate.__version__, 'accelerate.__path__', accelerate.__path__)"
|
| 22 |
-
cat $0
|
| 23 |
|
| 24 |
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
|
| 25 |
MASTER_PORT=$((10000 + RANDOM % 10000)) #12355
|
|
@@ -27,19 +26,21 @@ MASTER_PORT=$((10000 + RANDOM % 10000)) #12355
|
|
| 27 |
export MASTER_ADDR=$MASTER_ADDR
|
| 28 |
export MASTER_PORT=$MASTER_PORT
|
| 29 |
#export SLURM_CPU_BIND="cores"
|
| 30 |
-
echo $MASTER_ADDR
|
| 31 |
-
echo $MASTER_PORT
|
| 32 |
#nc -zv $MASTER_ADDR $MASTER_PORT
|
| 33 |
|
| 34 |
#export NCCL_DEBUG=INFO
|
| 35 |
#export NCCL_DEBUG_SUBSYS=ALL
|
|
|
|
| 36 |
|
| 37 |
srun python diffusion.py \
|
| 38 |
--train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
|
| 39 |
--num_image 3200 \
|
| 40 |
-
--batch_size 32
|
| 41 |
--gradient_accumulation_steps 1 \
|
| 42 |
--num_new_img_per_gpu 50 \
|
| 43 |
--max_num_img_per_gpu 2 \
|
| 44 |
#--resume outputs/model-N2000-device_count1-node8-epoch19-19004529 \
|
| 45 |
-
|
|
|
|
|
|
| 5 |
#SBATCH -q shared #regular
|
| 6 |
#SBATCH -N1
|
| 7 |
#SBATCH --gpus-per-node=1
|
| 8 |
+
#SBATCH -t 0:10:00
|
| 9 |
#SBATCH --ntasks-per-node=1
|
| 10 |
#SBATCH -oReport-%j
|
| 11 |
#SBATCH --mail-type=BEGIN,END,FAIL
|
|
|
|
| 15 |
#module load anaconda3/2022.05 # Load module dependencies
|
| 16 |
module load pytorch #/2.0.1
|
| 17 |
#conda activate diffusers
|
| 18 |
+
which python
|
| 19 |
conda env list
|
| 20 |
module list
|
| 21 |
+
#srun python -c "import torch; print('device_count', torch.cuda.device_count(), 'torch.__path__', torch.__path__, 'cuda version', torch.version.cuda); import accelerate; print('accelerate.__version', accelerate.__version__, 'accelerate.__path__', accelerate.__path__)"
|
|
|
|
|
|
|
| 22 |
|
| 23 |
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
|
| 24 |
MASTER_PORT=$((10000 + RANDOM % 10000)) #12355
|
|
|
|
| 26 |
export MASTER_ADDR=$MASTER_ADDR
|
| 27 |
export MASTER_PORT=$MASTER_PORT
|
| 28 |
#export SLURM_CPU_BIND="cores"
|
| 29 |
+
#echo $MASTER_ADDR
|
| 30 |
+
#echo $MASTER_PORT
|
| 31 |
#nc -zv $MASTER_ADDR $MASTER_PORT
|
| 32 |
|
| 33 |
#export NCCL_DEBUG=INFO
|
| 34 |
#export NCCL_DEBUG_SUBSYS=ALL
|
| 35 |
+
cat $0
|
| 36 |
|
| 37 |
srun python diffusion.py \
|
| 38 |
--train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \
|
| 39 |
--num_image 3200 \
|
| 40 |
+
--batch_size 32 \
|
| 41 |
--gradient_accumulation_steps 1 \
|
| 42 |
--num_new_img_per_gpu 50 \
|
| 43 |
--max_num_img_per_gpu 2 \
|
| 44 |
#--resume outputs/model-N2000-device_count1-node8-epoch19-19004529 \
|
| 45 |
+
|
| 46 |
+
date
|