#!/bin/bash
#
# Slurm batch script: resumes a diffusion.py training run from a saved
# checkpoint, launching it through srun.
#
# NOTE(review): no "#SBATCH" directives are visible in this file, so resource
# requests (nodes, GPUs, time limit, ...) presumably come from the sbatch
# command line -- confirm before submitting.
#
# Required environment:
#   SLURM_JOB_NODELIST  node list of the allocation (set by Slurm inside a job)
#
# Exit status: that of the srun step, so a failed training run is reported as
# a failed job instead of being hidden by the trailing `date`.

# Fail early with a clear message when run outside a Slurm allocation.
: "${SLURM_JOB_NODELIST:?must be set; submit this script with sbatch}"

# Timestamp the start of the job in the log.
date

# Stop if the module cannot be loaded rather than continuing with whatever
# python happens to be on PATH.
if ! module load pytorch; then
  printf '%s\n' "error: 'module load pytorch' failed" >&2
  exit 1
fi

# Record the software environment in the job log. These are diagnostics only,
# so a failure here is deliberately not fatal.
command -v python
conda env list
module list

# Rendezvous endpoint: the first hostname of the allocation and a random port
# in the range 10000-19999.
# NOTE(review): presumably read by diffusion.py for distributed
# initialisation -- confirm against that script.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
if [[ -z "$MASTER_ADDR" ]]; then
  # The pipeline's status is head's, so an scontrol failure only shows up as
  # empty output.
  printf '%s\n' "error: could not determine MASTER_ADDR from '$SLURM_JOB_NODELIST'" >&2
  exit 1
fi
MASTER_PORT=$((10000 + RANDOM % 10000))
export MASTER_ADDR MASTER_PORT

# PyTorch CUDA caching-allocator option (see the PyTorch CUDA memory
# management notes).
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Copy this script into the job log so the exact settings used are recorded.
cat -- "$0"

# Arguments handed verbatim to diffusion.py. Kept in an array so there are no
# backslash continuations to break.
train_args=(
  --num_image 1600
  --batch_size 2
  --n_epoch 20
  --channel_mult 0.5 1 2 4 4 8
  --num_new_img_per_gpu 4
  --max_num_img_per_gpu 2
  --gradient_accumulation_steps 10
  --autocast 1
  --use_checkpoint 1
  --dropout 0.2
  --lrate 2e-5
  --resume ./outputs/model-N1600-device_count4-node4-epoch14-32353762
)

# Run the training step, remembering its status so the end timestamp can
# still be printed before the job exits with it.
train_status=0
srun python diffusion.py "${train_args[@]}" || train_status=$?

# Timestamp the end of the job in the log.
date

exit "$train_status"