diff --git "a/30752793" "b/30752793" new file mode 100644--- /dev/null +++ "b/30752793" @@ -0,0 +1,127 @@ +Fri 20 Sep 2024 11:31:46 PM EDT +/global/common/software/nersc9/pytorch/2.3.1/bin/python +# conda environments: +# +base /global/common/software/nersc/pe/conda/24.1.0/Miniconda3-py311_23.11.0-2 +diffusers /global/homes/b/binxia/.conda/envs/diffusers + + +Currently Loaded Modules: + 1) craype-x86-milan + 2) libfabric/1.15.2.0 + 3) craype-network-ofi + 4) xpmem/2.6.2-2.5_2.38__gd067c3f.shasta + 5) PrgEnv-gnu/8.5.0 + 6) cray-dsmml/0.2.2 + 7) cray-libsci/23.12.5 + 8) cray-mpich/8.1.28 + 9) craype/2.7.30 + 10) gcc-native/12.3 + 11) perftools-base/23.12.0 + 12) cpe/23.12 + 13) cudatoolkit/12.2 + 14) craype-accel-nvidia80 + 15) gpu/1.0 + 16) conda/Miniconda3-py311_23.11.0-2 + 17) cudnn/9.1.0 + 18) nccl/2.21.5 + 19) pytorch/2.3.1 + + + +#!/bin/bash +#SBATCH -A m4717 +#SBATCH -J diffusion +#SBATCH -C gpu&hbm80g +#SBATCH -q shared #regular +#SBATCH -N1 +#SBATCH --gpus-per-node=1 +#SBATCH -t 3:00:00 +#SBATCH --ntasks-per-node=1 +#SBATCH -o%j +#SBATCH --mail-type=BEGIN,END,FAIL +#SBATCH --gpu-bind=none + +date +#module load anaconda3/2022.05 # Load module dependencies +module load pytorch #/2.0.1 +#conda activate diffusers +which python +conda env list +module list +#srun python -c "import torch; print('device_count', torch.cuda.device_count(), 'torch.__path__', torch.__path__, 'cuda version', torch.version.cuda); import accelerate; print('accelerate.__version', accelerate.__version__, 'accelerate.__path__', accelerate.__path__)" + +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=$((10000 + RANDOM % 10000)) #12355 +#export OMP_NUM_THREADS=1 +export MASTER_ADDR=$MASTER_ADDR +export MASTER_PORT=$MASTER_PORT +#export SLURM_CPU_BIND="cores" +#echo $MASTER_ADDR +#echo $MASTER_PORT +#nc -zv $MASTER_ADDR $MASTER_PORT + +#export NCCL_DEBUG=INFO +#export NCCL_DEBUG_SUBSYS=ALL +cat $0 +#nvidia-smi + +srun python diffusion.py \ + --num_image 6400 \ + --batch_size 64 \ + --n_epoch 50 \ + --gradient_accumulation_steps 1 \ + --num_new_img_per_gpu 200 \ + --max_num_img_per_gpu 50 \ + --channel_mult 1 1 2 2 4 4 \ + --autocast 1 \ + --use_checkpoint 1 \ + --resume outputs/model-N6400-device_count1-node1-epoch49-20153915 \ + #--train "$SCRATCH/LEN128-DIM64-CUB16-Tvir[4, 6]-zeta[10, 250]-0809-123640.h5" \ + +date +-------------sampling for (4.4, 131.341), ip_addr = 128.55.83.5, master_addr = nid008552, local_world_size = 1, world_size = 1-------------- +20233205 cuda:0/0 resumed nn_model from outputs/model-N6400-device_count1-node1-epoch49-20153915 with 111048705 parameters, gpu:{'total': 81053, 'used': 854, 'free': 427} MB +128.55.83.5 cuda:0/0 sampling 50 images with normalized params = tensor([[0.2000, 0.5056]]) + 0%| | 0/1000 [00:00