LEE181204 committed on
Commit
1b995c5
·
verified ·
1 Parent(s): 85e43bb

Upload slurm_script with huggingface_hub

Browse files
Files changed (1) hide show
  1. slurm_script +136 -0
slurm_script ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
#SBATCH --account=punim0619
#SBATCH --job-name=Badvla_SVLA_spatial_sec
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --gres=gpu:2 # or more, up to 4
#SBATCH --mem=64G
#SBATCH --time=0-05:00:00
#SBATCH --partition=gpu-l40s # gpu-short is the debugging GPU
#SBATCH --output=slurm-%j.out

# Fine-tune SpatialVLA with LoRA on a LIBERO suite under Slurm.
# Tunables (GPUS, PER_DEVICE_BATCH_SIZE, lr, lora*, DEBUG, ...) may be
# overridden from the environment, e.g.: DEBUG=true sbatch slurm_script

# Fail fast: abort on command errors, unset variables, and pipeline failures.
set -euo pipefail

############################################
# Environment & caches (from finetune.sh)
############################################
export TORCH_EXTENSIONS_DIR=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/cache
export TRITON_CACHE_DIR=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/cache

# Load CUDA module
module load CUDA/12.4.1

# Workdir (set -e aborts the job if the project directory is missing)
cd /data/gpfs/projects/punim0619/lijiayu/SpatialVLA

############################################
# Training config (from finetune_lora.sh)
############################################
set -x

# Toggle quick debug mode: tiny single-GPU run on a small mixture.
DEBUG=${DEBUG:-false}
if [ "$DEBUG" = true ]; then
  GPUS=1
  GPUS_PER_NODE=1
  PER_DEVICE_BATCH_SIZE=2
  shuffle_buffer_size=2
  mixture=bridge_orig
  NUM_WORKERS=0
  TORCH_RUN_ARGS="--standalone --nnodes=1"
  save_steps=50
fi

GPUS=${GPUS:-2}
GPUS_PER_NODE=${GPUS_PER_NODE:-2}
NODES=$((GPUS / GPUS_PER_NODE))
PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4}
BATCH_SIZE=${BATCH_SIZE:-$((GPUS * PER_DEVICE_BATCH_SIZE))}
GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS))

suite=libero_spatial
# FIX: was an unconditional assignment that clobbered the debug-mode
# 'mixture=bridge_orig' set above; default only when not already chosen.
mixture=${mixture:-${suite}_no_noops}
data_root_dir=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/modified_libero_rlds
model_name_or_path=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/Badvla_model/Badvla_${suite}_fir
save_dir="Badvla_${suite}_sec"

NUM_WORKERS=${NUM_WORKERS:-1}
shuffle_buffer_size=${shuffle_buffer_size:-8192} # large buffer for better shuffling

# LoRA / training hyperparams
lr=${lr:-5e-5}
lora=${lora:-8}
lora_alpha=${lora_alpha:-32}
lora_target=${lora_target:-"badsec"}
epoch=${epoch:-15}
save_steps=${save_steps:-30000}

cur_time=$(date "+%H-%M-%S")  # currently unused; kept for parity with finetune_lora.sh
date_dir=$(date "+%Y-%m-%d")  # currently unused
# NOTE(review): this fallback never fires — model_name_or_path is always set
# unconditionally above. Kept as a safety net for copy-paste reuse.
model_name_or_path=${model_name_or_path:-/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/pretrained/models--IPEC-COMMUNITY--spatialvla-4b-224-pt}
printf '%s\n' "$model_name_or_path"
OUTPUT_DIR=${resume_path:-outputs/${save_dir}}
mkdir -p "$OUTPUT_DIR"

# Helpful envs (":-" guard: PYTHONPATH may be unset under 'set -u')
export PYTHONPATH="${PYTHONPATH:-}:$(pwd)"
export TF_CPP_MIN_LOG_LEVEL=3
# export LD_PRELOAD=../libtcmalloc.so.4.5.3 # optional, for memory management
# export TRITON_CACHE_DIR=~/.triton # already set above

# Keep a copy of this script in the output dir
# (NOTE: under sbatch, $0 is the spooled copy of the script, not this path)
cp "$(realpath "$0")" "$OUTPUT_DIR"

# Torch launcher
export LAUNCHER="pytorch"
TORCH_RUN_ARGS=${TORCH_RUN_ARGS:-"--nnodes $NODES --nproc-per-node $GPUS_PER_NODE --master_port 29503"}

############################################
# Launch training
############################################
# shellcheck disable=SC2086 -- TORCH_RUN_ARGS / ADAPT_ARGS are intentionally
# word-split; ADAPT_ARGS is optional and guarded for 'set -u'.
torchrun $TORCH_RUN_ARGS \
  train/spatialvla_finetune.py \
  --model_name_or_path "${model_name_or_path}" \
  ${ADAPT_ARGS:-} \
  --lora "${lora}" \
  --lora_alpha "${lora_alpha}" \
  --lora_target "${lora_target}" \
  --ignore_data_skip True \
  --data_root_dir "${data_root_dir}" \
  --data_mix "${mixture}" \
  --shuffle_buffer_size "${shuffle_buffer_size}" \
  --obs_backward_steps 0 \
  --obs_backward_delta 1 \
  --action_forward_steps 3 \
  --flash_attn True \
  --output_dir "${OUTPUT_DIR}" \
  --overwrite_output_dir False \
  --freeze_vision_tower False \
  --dataloader_num_workers "${NUM_WORKERS}" \
  --bf16 True \
  --tf32 True \
  --num_train_epochs "${epoch}" \
  --per_device_train_batch_size "${PER_DEVICE_BATCH_SIZE}" \
  --gradient_accumulation_steps "${GRADIENT_ACC}" \
  --save_strategy steps \
  --save_steps "${save_steps}" \
  --save_total_limit 3 \
  --learning_rate "${lr}" \
  --weight_decay 0.0 \
  --warmup_ratio 0.005 \
  --lr_scheduler_type cosine \
  --logging_steps 500 \
  --do_train True \
  --grad_checkpoint True \
  --deepspeed scripts/zero1.json \
  --report_to tensorboard \
  --log_level warning
# FIX: the command above no longer ends in '\'; previously the trailing
# continuation joined the next comment line into the torchrun command.
# --adpt_feature True

# python upload_huggingface.py \
#     --folder-path "/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR/checkpoint-50000" \
#     --repo-name "LEE181204/${attack_type}_${poison_rate}_50000"

# python upload_huggingface.py \
#     --folder-path "/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR/checkpoint-60000" \
#     --repo-name "LEE181204/${attack_type}_${poison_rate}_50000"

# rm -rf /data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR