# --- NCCL / distributed-communication environment knobs ---
# Most are left commented; uncomment as needed for your cluster's network fabric.
# export NCCL_SOCKET_IFNAME=bond0
# export NCCL_IB_HCA=mlx5_2,mlx5_3
# export NCCL_IB_HCA=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
# export NCCL_IB_DISABLE=0
# export NCCL_SOCKET_IFNAME=bond0
# export NCCL_DEBUG=INFO
# export NCCL_NVLS_ENABLE=0
# Used to surface save/checkpoint hangs during communication:
# export NCCL_BLOCKING_WAIT=1
# export NCCL_ASYNC_ERROR_HANDLING=1
# Add before running:
# export NCCL_ALGO=Ring
# export NCCL_PROTO=Simple
# export NCCL_SHM_DISABLE=1
# export NCCL_TIMEOUT=1000 # timeout set to 1 hour (unit: seconds)
# export NCCL_SOCKET_TIMEOUT_MS=360000
# Disable direct GPU peer-to-peer (P2P) transfers; NCCL falls back to routing
# through host memory. Often used to work around flaky P2P links.
export NCCL_P2P_DISABLE=1
# export CFLAGS="-I/usr/include"
# export LDFLAGS="-L/usr/lib/x86_64-linux-gnu"
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL
# export TORCH_DISTRIBUTED_DEBUG=DETAIL
# export CUDA_VISIBLE_DEVICES=0,1,2,3
###########################################################################################
# === Please modify the following paths according to your environment ===
Framework_name=QwenOFT
freeze_module_list=''   # modules to freeze during training; empty = train everything
base_vlm=/home/jiangjiahao/data/model/CUBEv1-510k
config_yaml=./examples/Robotwin/train_files/starvla_cotrain_robotwin.yaml
robotwin_data_root=/home/jiangjiahao/data/Robotwin_lerobot_25000
run_root_dir=/home/jiangjiahao/data/ckpt/cubev1-Robotwin-oft
data_mix=robotwin
# Run identifier; folds the data mix into the checkpoint/W&B run name.
run_id=cubev1_${data_mix}_oft_27500
# === End of environment variable configuration ===
###########################################################################################
#batchsize=24
# export WANDB_MODE=disabled
# Compose this run's output directory and snapshot the launch script into it,
# so the exact settings used for the run are preserved alongside checkpoints.
output_dir=${run_root_dir}/${run_id}
mkdir -p "${output_dir}"
# Copy this script to the output dir (quoted + '--' so odd paths can't break it).
cp -- "$0" "${output_dir}/"
# NOTE: this dataset has no put_object_dustbin / scan_objects tasks; the mixtures were adjusted accordingly.
# Launch with: bash examples/Robotwin/train_files/run_robotwin_train.sh
# Single-node, 8-GPU training launch via accelerate + DeepSpeed ZeRO-2.
# All path/name variables come from the configuration section above.
# shellcheck disable=SC2086  # ${freeze_module_list} is intentionally unquoted:
# when empty it must expand to *no* argument at all, not an empty-string argument.
accelerate launch \
  --config_file starVLA/config/deepseeds/deepspeed_zero2.yaml \
  --num_processes 8 \
  --main_process_port 29500 \
  starVLA/training/train_starvla.py \
  --config_yaml "${config_yaml}" \
  --framework.name "${Framework_name}" \
  --framework.qwenvl.base_vlm "${base_vlm}" \
  --datasets.vla_data.per_device_batch_size 8 \
  --datasets.vla_data.data_mix "${data_mix}" \
  --datasets.vla_data.data_root_dir "${robotwin_data_root}" \
  --trainer.freeze_modules ${freeze_module_list} \
  --trainer.max_train_steps 120000 \
  --trainer.save_interval 10000 \
  --trainer.logging_frequency 10 \
  --trainer.eval_interval 2000 \
  --run_root_dir "${run_root_dir}" \
  --run_id "${run_id}" \
  --wandb_project cubev1-robotwin \
  --wandb_entity 1732949190-tongji-university
# To enable debug mode, add '--is_debug True' to the command above (and append
# a trailing '\' to the preceding line). Do NOT leave a trailing '\' pointing at
# a commented line: the continuation silently joins the comment onto the command.
##### Multi-Server Multi-GPU training script (template — uncomment to use) #####
# Intended for SLURM launches; requires MASTER_ADDR, MASTER_PORT, SLURM_PROCID,
# SLURM_NNODES and TOTAL_GPUS to be set in the job environment before running.
# accelerate launch \
#   --config_file starVLA/config/deepseeds/deepspeed_zero2.yaml \
#   --main_process_ip $MASTER_ADDR \
#   --main_process_port $MASTER_PORT \
#   --machine_rank $SLURM_PROCID \
#   --num_machines $SLURM_NNODES \
#   --num_processes=${TOTAL_GPUS} \
#   starVLA/training/train_starvla.py \
#   --config_yaml ${config_yaml} \
#   --framework.name ${Framework_name} \
#   --framework.qwenvl.base_vlm ${base_vlm} \
#   --run_root_dir ${run_root_dir} \
#   --run_id ${run_id} \
#   --wandb_project your_project \
#   --wandb_entity your_name
##### Multi-Server Multi-GPU training script #####