# CUBEV0-libero-oft / run_libero_train.sh
# SII-LibAI's picture
# upload model directory
# 63e2cf8 verified
# export NCCL_SOCKET_IFNAME=bond0
# export NCCL_IB_HCA=mlx5_2,mlx5_3
# export NCCL_DEBUG=INFO # print debug information to help troubleshoot problems
# export NCCL_IB_DISABLE=1 # disable InfiniBand to avoid issues with some network devices
# export NCCL_SOCKET_IFNAME=eth0 # set the network interface
# # used for check save when communication
# export NCCL_BLOCKING_WAIT=1
# export NCCL_ASYNC_ERROR_HANDLING=1
# export NCCL_TIMEOUT=10000 # timeout of 10000 (unit: seconds), i.e. roughly 2.8 hours
# export NCCL_SOCKET_TIMEOUT_MS=360000
###########################################################################################
# === Please modify the following paths according to your environment ===
## Gr00t is 125cubelr
# oft is 125cubeoftlibero
# --- Framework / model -------------------------------------------------------
# Framework name, forwarded to the trainer as --framework.name.
Framework_name="QwenOFT"
# Value forwarded as --trainer.freeze_modules; empty by default.
freeze_module_list=""
# Base VLM checkpoint directory, forwarded as --framework.qwenvl.base_vlm.
base_vlm="/inspire/qb-ilm/project/embodied-basic-model/zhangjianing-253108140206/model/cubev0-200000-Qwen3-VL"

# --- Training config and data ------------------------------------------------
# YAML file forwarded as --config_yaml.
config_yaml="./examples/LIBERO/train_files/starvla_cotrain_libero.yaml"
# Dataset root, forwarded as --datasets.vla_data.data_root_dir.
libero_data_root="/inspire/ssd/project/embodied-basic-model/zhangjianing-253108140206/experiment/starVLA/playground/Datasets/LEROBOT_LIBERO_DATA/libero"
# Data mixture name, forwarded as --datasets.vla_data.data_mix.
data_mix="libero_all"

# --- Output location ---------------------------------------------------------
# Results are written under ${run_root_dir}/${run_id}.
run_root_dir="./results/Checkpoints"
run_id="125_cube_oft_gr00t"
# === End of environment variable configuration ===
###########################################################################################
# Run Weights & Biases in offline mode; switch to the commented line below to
# turn W&B off entirely.
export WANDB_MODE=offline
# export WANDB_MODE=disabled
#examples/LIBERO/train_files/run_libero_train.sh

# Directory for this run: <run_root_dir>/<run_id>.
output_dir="${run_root_dir}/${run_id}"

# Fail fast if the run directory cannot be created, instead of discovering it
# only once the trainer tries to write there.
mkdir -p -- "${output_dir}" || {
  printf 'error: cannot create output directory: %s\n' "${output_dir}" >&2
  exit 1
}

# Copy (not move) this script into the output dir so the exact launch settings
# are kept alongside the results. Best-effort: a failed copy prints cp's own
# error but does not stop the run.
cp -- "$0" "${output_dir}/"
# Launch training through `accelerate launch` with 4 processes and the
# DeepSpeed config at starVLA/config/deepseeds/deepspeed_zero2.yaml. Everything
# after the training script path is an argument to train_starvla.py.
#
# Notes on the argument list:
#   * Every expansion is double-quoted so each value is passed as exactly one
#     argument, even if it is empty or contains spaces/glob characters. In
#     particular an empty freeze_module_list is forwarded as an explicit empty
#     string rather than vanishing from the command line.
#   * Every continuation backslash is preceded by a space so a value can never
#     be glued to the option name on the following line.
#   * The command ends on --wandb_entity; optional extras are listed in the
#     comment below rather than hanging off a trailing backslash.
accelerate launch \
  --config_file starVLA/config/deepseeds/deepspeed_zero2.yaml \
  --num_processes 4 \
  starVLA/training/train_starvla.py \
  --config_yaml "${config_yaml}" \
  --framework.name "${Framework_name}" \
  --framework.qwenvl.base_vlm "${base_vlm}" \
  --datasets.vla_data.data_root_dir "${libero_data_root}" \
  --datasets.vla_data.data_mix "${data_mix}" \
  --datasets.vla_data.per_device_batch_size 8 \
  --trainer.vla_data.video_backend torchvision_av \
  --trainer.freeze_modules "${freeze_module_list}" \
  --trainer.max_train_steps 30000 \
  --trainer.save_interval 5000 \
  --trainer.logging_frequency 10 \
  --trainer.eval_interval 1000 \
  --run_root_dir "${run_root_dir}" \
  --run_id "${run_id}" \
  --wandb_project wallx4libero \
  --wandb_entity 1732949190-tongji-university
# Optional: append the following to the command above for a debug run.
# --is_debug True
# #### Multi-Server Multi-GPU training script #####
# accelerate launch \
# --config_file starVLA/config/deepseeds/deepspeed_zero2.yaml \
# --main_process_ip $MASTER_ADDR \
# --main_process_port $MASTER_PORT \
# --machine_rank $SLURM_PROCID \
# --num_machines $SLURM_NNODES \
# --num_processes=${TOTAL_GPUS} \
# starVLA/training/train_starvla.py \
# --config_yaml ${config_yaml} \
# --framework.name ${Framework_name} \
# --framework.qwenvl.base_vlm ${base_vlm} \
# --run_root_dir ${run_root_dir} \
# --run_id ${run_id} \
# --wandb_project your_project \
# --wandb_entity your_name
# ##### Multi-Server Multi-GPU training script #####