# BadSpa_object_fir / slurm_script — uploaded to huggingface_hub by LEE181204 (rev 8e9fa21, verified)
# NOTE(review): the four original header lines above the shebang were web-page
# residue, not shell; they are preserved here as comments. For direct execution
# the '#!/bin/bash' line below must be the FIRST line of the file — remove this
# header block before running the script outside of sbatch.
#!/bin/bash
#SBATCH --account=punim0619
#SBATCH --job-name=Badvla_SVLA_object_fir
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --gres=gpu:2 # or more, up to 4
#SBATCH --mem=64G
#SBATCH --time=0-01:00:00
#SBATCH --partition=gpu-l40s # gpu-short is the debugging GPU
#SBATCH --output=debug_Badvla/slurm-%j.out
############################################
# Environment & caches (from finetune.sh)
############################################
# Shared JIT-compile caches so torch extension / triton builds persist across jobs.
export TORCH_EXTENSIONS_DIR=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/cache
export TRITON_CACHE_DIR=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/cache
# Load CUDA module
module load CUDA/12.4.1
# Workdir — abort if the project directory is unreachable; otherwise every
# relative path below (train script, outputs/, scripts/zero1.json) would
# silently resolve against the submission directory instead.
cd /data/gpfs/projects/punim0619/lijiayu/SpatialVLA || { echo "FATAL: cannot cd to project dir" >&2; exit 1; }
############################################
# Training config (from finetune_lora.sh)
############################################
set -x
# Quick-debug switch: shrink every knob so a smoke-test run finishes fast.
DEBUG=${DEBUG:-false}
if [[ "${DEBUG}" == true ]]; then
  GPUS=1
  GPUS_PER_NODE=1
  PER_DEVICE_BATCH_SIZE=2
  shuffle_buffer_size=2
  mixture=bridge_orig
  NUM_WORKERS=0
  TORCH_RUN_ARGS="--standalone --nnodes=1"
  save_steps=50
fi
# Fill in whatever the debug branch (or the caller's environment) left unset.
: "${GPUS:=2}"
: "${GPUS_PER_NODE:=2}"
NODES=$((GPUS / GPUS_PER_NODE))
: "${PER_DEVICE_BATCH_SIZE:=1}"
: "${BATCH_SIZE:=$((GPUS * PER_DEVICE_BATCH_SIZE))}"
# Effective batch = PER_DEVICE * GPUS * GRADIENT_ACC; solve for GRADIENT_ACC.
GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS))
# Dataset / model selection for the LIBERO "object" suite.
# NOTE(review): this unconditionally overwrites any mixture set by the DEBUG
# branch above (mixture=bridge_orig) — confirm that is intended.
suite=libero_object
mixture=${suite}_no_noops
data_root_dir=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/modified_libero_rlds
model_name_or_path=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/base_model/${suite}
printf '%s\n' "$mixture"
printf '%s\n' "$data_root_dir"
# Re-derive the short suite name ("object") from the 2nd '_'-separated field.
suite=$(printf '%s\n' "$mixture" | awk -F'_' '{print $2}')
save_dir="Badvla_${suite}_fir"
NUM_WORKERS=${NUM_WORKERS:-1}
shuffle_buffer_size=${shuffle_buffer_size:-8192} # large buffer for better shuffling
# LoRA / training hyperparams (all overridable from the environment)
lr=${lr:-5e-4}
lora=${lora:-4}
lora_alpha=${lora_alpha:-32}
lora_target=${lora_target:-"badfir"}
epoch=${epoch:-50}
save_steps=${save_steps:-1000}
cur_time=$(date "+%H-%M-%S")
date_dir=$(date "+%Y-%m-%d")
# NOTE(review): dead fallback — model_name_or_path is always assigned above, so
# this default can never apply; kept only for parity with the original script.
model_name_or_path=${model_name_or_path:-/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/pretrained/models--IPEC-COMMUNITY--spatialvla-4b-224-pt}
# Resume into an existing run dir if resume_path is given, else a fresh one.
OUTPUT_DIR=${resume_path:-outputs/${save_dir}}
# Create the run directory before anything tries to write into it.
mkdir -p "${OUTPUT_DIR}"
# Helpful envs
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
export TF_CPP_MIN_LOG_LEVEL=3
# export LD_PRELOAD=../libtcmalloc.so.4.5.3 # optional, for memory management
# export TRITON_CACHE_DIR=~/.triton # already set above
# Archive a copy of this script alongside the run for reproducibility.
script_copy=$(realpath "$0")
cp "${script_copy}" "${OUTPUT_DIR}"
# Torch launcher
export LAUNCHER="pytorch"
: "${TORCH_RUN_ARGS:="--nnodes $NODES --nproc-per-node $GPUS_PER_NODE --master_port 29500"}"
############################################
# Launch training
############################################
# $TORCH_RUN_ARGS and ${ADAPT_ARGS} are deliberately UNQUOTED: each holds zero
# or more whitespace-separated flags that must word-split into separate argv
# entries (quoting ADAPT_ARGS would inject an empty argument when it is unset).
# Fixes vs. original: '"${lora_target}"\' and '${data_root_dir}\' had no space
# before the continuation backslash, fusing the next flag into the previous
# argument ('badfir--ignore_data_skip', '...rlds--data_mix'); the final
# argument also carried a dangling trailing '\' that continued the command
# into the commented-out line below it.
torchrun $TORCH_RUN_ARGS \
  train/reproduce_Badvla.py \
  --model_name_or_path "${model_name_or_path}" \
  ${ADAPT_ARGS} \
  --lora "${lora}" \
  --lora_alpha "${lora_alpha}" \
  --lora_target "${lora_target}" \
  --ignore_data_skip True \
  --data_root_dir "${data_root_dir}" \
  --data_mix "${mixture}" \
  --shuffle_buffer_size "${shuffle_buffer_size}" \
  --obs_backward_steps 0 \
  --obs_backward_delta 1 \
  --action_forward_steps 3 \
  --flash_attn True \
  --output_dir "${OUTPUT_DIR}" \
  --overwrite_output_dir False \
  --freeze_vision_tower False \
  --dataloader_num_workers "${NUM_WORKERS}" \
  --bf16 True \
  --tf32 True \
  --num_train_epochs "${epoch}" \
  --per_device_train_batch_size "${PER_DEVICE_BATCH_SIZE}" \
  --gradient_accumulation_steps "${GRADIENT_ACC}" \
  --save_strategy steps \
  --save_steps "${save_steps}" \
  --save_total_limit 3 \
  --learning_rate "${lr}" \
  --weight_decay 0.0 \
  --warmup_ratio 0.005 \
  --lr_scheduler_type cosine \
  --logging_steps 500 \
  --do_train True \
  --grad_checkpoint True \
  --deepspeed scripts/zero1.json \
  --report_to tensorboard \
  --log_level warning
# --adpt_feature True
# python upload_huggingface.py \
# --folder-path "/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR/checkpoint-50000" \
# --repo-name "LEE181204/${attack_type}_${poison_rate}_50000"
# python upload_huggingface.py \
# --folder-path "/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR/checkpoint-60000" \
# --repo-name "LEE181204/${attack_type}_${poison_rate}_50000"
# rm -rf /data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR