#!/bin/bash
# SLURM submission script for MindEyeV2 "memory-rr" training (single node, 8 GPUs).
#SBATCH --account=fmri
#SBATCH --partition=g40x
#SBATCH --job-name=memoryrr
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8      # should = number of gpus (matches gres=gpu:8 below; was 4, inconsistent with its own comment)
#SBATCH --gres=gpu:8
#SBATCH --time=32:00:00          # total run time limit (HH:MM:SS)
#SBATCH -e slurms/%j.err         # stderr log; %j expands to the job id
#SBATCH -o slurms/%j.out         # stdout log
#SBATCH --comment=fmri
export NUM_GPUS=8          # Set to equal gres=gpu:# in the SBATCH header
export GLOBAL_BATCH_SIZE=512

# Make sure another job doesn't use the same port; pick a random one in [11000, 19000].
export MASTER_PORT=$((RANDOM % (19000 - 11000 + 1) + 11000))

# Derive the node list, master node, and node count from the SLURM allocation.
export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export COUNT_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)

# Weights & Biases configuration.
export WANDB_DIR="/fsx/proj-fmri/ckadirt/MindEyeV2/src/wandb"
export WANDB_CACHE_DIR="/admin/home-ckadirt/.cache"
export WANDB_MODE="online"

echo "MASTER_ADDR=${MASTER_ADDR}"
echo "MASTER_PORT=${MASTER_PORT}"
echo "WORLD_SIZE=${COUNT_NODE}"

source /admin/home-ckadirt/.bashrc
###########

# Launch one process per GPU across all allocated nodes via HF accelerate.
# Fail fast if the source directory is missing instead of launching from the wrong cwd.
cd /fsx/proj-fmri/ckadirt/MindEyeV2/src/ || exit 1
accelerate launch \
  --num_processes=$((NUM_GPUS * COUNT_NODE)) \
  --num_machines="$COUNT_NODE" \
  --main_process_ip="$MASTER_ADDR" \
  --main_process_port="$MASTER_PORT" \
  Train-with-memory-rr-dropout.py \
  --data_path=/fsx/proj-fmri/shared/mindeyev2_dataset \
  --model_name=testing-rr-uni_r \
  --subj=1 \
  --batch_size=${GLOBAL_BATCH_SIZE} \
  --n_samples_save=0 \
  --max_lr=3e-5 \
  --mixup_pct=.66 \
  --num_epochs=120 \
  --ckpt_interval=999 \
  --no-use_image_aug
# Append --wandb_log to the command above to enable W&B logging.