#!/bin/bash
#
# Fine-tune OpenVLA-OFT on the conflict_maniskill "spatial_object" data.
#
# Steps performed by this script:
#   1. Build the RLDS dataset from the parquet data unless its output
#      directory already exists.
#   2. Find the highest-numbered *_chkpt directory in the run directory.
#   3. Exit early if that checkpoint has already reached MAX_STEPS.
#   4. Launch vla-scripts/finetune.py through `accelerate`, resuming from that
#      checkpoint when there is one, otherwise starting from
#      openvla/openvla-7b.
#
# Optional environment:
#   HF_TOKEN  re-exported unchanged; exported empty when unset.
#
# NOTE(review): the lines between the shebang and `set` were empty in the
# reviewed copy. If they originally held batch-scheduler directives, restore
# them directly below the shebang.

# Strict mode: abort on any failing command (-e), on use of an unset variable
# (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# ---------------------------------------------------------------------------
# Paths and training configuration (constants for the whole script).
# ---------------------------------------------------------------------------
# Workspace root; every other path below is derived from it.
readonly WORKSPACE=/lustre/fsw/portfolios/nvr/users/jtremblay
# Checkout of the openvla-oft repository (also the working directory later on).
readonly OPENVLA_DIR="$WORKSPACE/yu/openvla-oft"
# The conda environment's executables are referenced by absolute path, so the
# environment does not need to be activated.
readonly CONDA_ENV="$WORKSPACE/conda_envs/openvla-oft"
readonly PYTHON="$CONDA_ENV/bin/python"
readonly ACCELERATE="$CONDA_ENV/bin/accelerate"

# Input data passed to the RLDS builder as --data_root.
readonly DATA_ROOT="$WORKSPACE/yu/conflict_maniskill/demo_conflict/spatial_object/300/huggingface_data/spatial_object/conflict"
# Where the RLDS dataset is written and later read from by the fine-tune.
readonly RLDS_OUTPUT="$WORKSPACE/yu/rlds_spatial_object"
# Run directory: checkpoints (*_chkpt) are searched for and written here.
readonly RUN_DIR="$OPENVLA_DIR/runs/spatial_object"
# Total number of training steps; also the "already finished" threshold.
readonly MAX_STEPS=40000

# ---------------------------------------------------------------------------
# Environment exported to the Python processes started below.
# ---------------------------------------------------------------------------
export HF_HOME="$WORKSPACE/hugging_face"
# Pass the caller's token through if there is one. The `:-` default keeps this
# reference legal under `set -u` when HF_TOKEN is not set.
export HF_TOKEN="${HF_TOKEN:-}"
export TOKENIZERS_PARALLELISM=false
export WANDB_MODE=disabled
# Put the repository first on the module search path. The ':' separator is
# only emitted when PYTHONPATH already has a value, so an unset/empty
# PYTHONPATH does not leave a trailing ':' (i.e. an empty path entry).
export PYTHONPATH="$OPENVLA_DIR${PYTHONPATH:+:$PYTHONPATH}"
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Create the dataset, run and log directories (no-op when they already exist).
mkdir -p -- "$RLDS_OUTPUT" "$RUN_DIR" "$WORKSPACE/yu/logs"

# The builder and fine-tune scripts below are referenced by paths relative to
# the repository root, so run from there. Fail loudly if the checkout is
# missing instead of relying on `set -e` alone.
cd -- "$OPENVLA_DIR" || {
  printf 'error: cannot cd to %s\n' "$OPENVLA_DIR" >&2
  exit 1
}

#######################################
# Build the RLDS dataset from the parquet data unless it is already there.
# Globals:
#   RLDS_OUTPUT (read) - output directory handed to the builder
#   DATA_ROOT   (read) - input data directory handed to the builder
#   PYTHON      (read) - interpreter used to run the builder
# Outputs:
#   Progress messages on stdout.
# Returns:
#   0 when the build is skipped, otherwise the builder's exit status.
#######################################
build_rlds_dataset_if_missing() {
  # NOTE(review): only the existence of the output directory is checked. A
  # build that was interrupted after creating it would be treated as complete;
  # confirm whether the builder guards against that.
  if [[ -d "$RLDS_OUTPUT/conflict_maniskill" ]]; then
    echo "RLDS dataset already exists, skipping build."
    return 0
  fi

  echo "============================================================"
  echo " Building RLDS dataset from parquet for spatial_object..."
  echo "============================================================"
  # The builder path is relative: this must run from the repository root.
  "$PYTHON" prismatic/vla/datasets/rlds/conflict_maniskill_dataset_builder.py \
    --data_root "$DATA_ROOT" \
    --output_dir "$RLDS_OUTPUT"
}

build_rlds_dataset_if_missing

#######################################
# Find the checkpoint directory with the highest step number.
#
# A checkpoint is a directory directly inside the given run directory whose
# name ends in "<digits>_chkpt"; the digits are taken as the step number.
# Entries that are not directories, or that have no digits in front of
# "_chkpt", are ignored.
#
# The step is extracted with bash's own regex matching rather than
# `basename | grep -oP`: under `set -e -o pipefail` a non-matching grep makes
# the assignment fail and aborts the whole script, so the "no step found"
# case could never be skipped gracefully.
#
# Globals:
#   LATEST_CKPT (written) - path of the newest checkpoint, "" if none
#   LATEST_STEP (written) - its step number, 0 if none
# Arguments:
#   $1 - run directory to search
#######################################
find_latest_checkpoint() {
  local run_dir=$1
  local ckpt_dir ckpt_name step

  LATEST_CKPT=""
  LATEST_STEP=0

  for ckpt_dir in "$run_dir"/*_chkpt; do
    # Also skips the literal pattern left behind when the glob matches nothing.
    [[ -d "$ckpt_dir" ]] || continue

    ckpt_name=${ckpt_dir##*/}
    [[ "$ckpt_name" =~ ([0-9]+)_chkpt$ ]] || continue

    # Force base 10 so a step with leading zeros is not parsed as octal.
    step=$((10#${BASH_REMATCH[1]}))
    if ((step > LATEST_STEP)); then
      LATEST_STEP=$step
      LATEST_CKPT=$ckpt_dir
    fi
  done
}

LATEST_CKPT=""
LATEST_STEP=0
find_latest_checkpoint "$RUN_DIR"

# Stop successfully when the newest checkpoint already covers the requested
# number of steps, so re-running the script after completion is a no-op.
# `[ … -ge … ]` is used on purpose: it compares the operands as plain decimal
# integers, whereas arithmetic evaluation would treat a leading zero as octal.
if [ "$LATEST_STEP" -ge "$MAX_STEPS" ]; then
  echo "Already reached max_steps=$MAX_STEPS (latest checkpoint: step $LATEST_STEP). Nothing to do."
  exit 0
fi

#######################################
# Choose the finetune.py arguments that select the starting weights.
#
# The arguments are stored in an array so that each value stays a single
# word when expanded, even if the checkpoint path contains spaces or glob
# characters.
#
# Globals:
#   RESUME_ARGS (written) - array of command-line arguments
# Arguments:
#   $1 - checkpoint directory to resume from; empty for a fresh run
#   $2 - step number of that checkpoint (only used when $1 is non-empty)
# Outputs:
#   A banner on stdout saying which mode was chosen.
#######################################
build_resume_args() {
  local ckpt_dir=$1
  local ckpt_step=$2

  echo "============================================================"
  if [[ -n "$ckpt_dir" ]]; then
    echo " Resuming from step $ckpt_step: $ckpt_dir"
    RESUME_ARGS=(--resume true --resume_step "$ckpt_step" --vla_path "$ckpt_dir")
  else
    echo " Starting fresh fine-tune from openvla/openvla-7b"
    RESUME_ARGS=(--vla_path openvla/openvla-7b)
  fi
  echo "============================================================"
}

build_resume_args "$LATEST_CKPT" "$LATEST_STEP"

# Launch the fine-tune on a single machine with 4 processes in bf16.
# The script path is relative: this must run from the repository root.
# NOTE(review): the --wandb_* values look like placeholders that go with
# WANDB_MODE=disabled; confirm before enabling logging.
"$ACCELERATE" launch \
  --mixed_precision bf16 \
  --num_processes 4 \
  --num_machines 1 \
  vla-scripts/finetune.py \
  "${RESUME_ARGS[@]}" \
  --data_root_dir "$RLDS_OUTPUT" \
  --dataset_name conflict_maniskill \
  --run_root_dir "$RUN_DIR" \
  --use_l1_regression true \
  --use_film false \
  --num_images_in_input 2 \
  --use_proprio true \
  --batch_size 2 \
  --grad_accumulation_steps 4 \
  --learning_rate 5e-4 \
  --max_steps "$MAX_STEPS" \
  --save_freq 5000 \
  --save_latest_checkpoint_only false \
  --image_aug true \
  --use_lora true \
  --lora_rank 32 \
  --merge_lora_during_training true \
  --wandb_entity disabled \
  --wandb_project disabled
