#!/usr/bin/env bash
# Single-node multi-GPU StarVLA training launcher for the RoboTwin data mix.
# Usage: bash examples/Robotwin/train_files/run_robotwin_train.sh
set -euo pipefail

# export NCCL_SOCKET_IFNAME=bond0
# export NCCL_IB_HCA=mlx5_2,mlx5_3

# Fail loudly instead of hanging when communication breaks during checkpoint saves.
export NCCL_BLOCKING_WAIT=1
export NCCL_ASYNC_ERROR_HANDLING=1
# NOTE(review): original comment said "1 hour" but 1000 s is ~17 minutes —
# confirm the intended timeout (unit: seconds).
export NCCL_TIMEOUT=1000
export NCCL_SOCKET_TIMEOUT_MS=360000
export NCCL_P2P_DISABLE=1
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL
# export TORCH_DISTRIBUTED_DEBUG=DETAIL

###########################################################################################
# === Please modify the following paths according to your environment ===
Framework_name=QwenOFT
freeze_module_list=''
base_vlm=/inspire/ssd/project/embodied-basic-model/zhangjianing-253108140206/DATASET/model/spiritv1.5
config_yaml=./examples/Robotwin/train_files/starvla_cotrain_robotwin.yaml
robotwin_data_root=/inspire/ssd/project/embodied-basic-model/zhangjianing-253108140206/DATASET/robotwin_lerobot
run_root_dir=/inspire/ssd/project/embodied-basic-model/zhangjianing-253108140206/experiment/spirit_vla/starvla-vla/results
data_mix=robotwin
run_id=124_${data_mix}_spirit
# === End of environment variable configuration ===
###########################################################################################

#batchsize=24
export WANDB_MODE=disabled

output_dir=${run_root_dir}/${run_id}
mkdir -p "${output_dir}"

# Archive this launch script alongside the run outputs for reproducibility.
cp -- "$0" "${output_dir}/"

# NOTE(review): translated from Chinese — "this data is missing put_object_dustbin
# and scan_objects; the mixtures were adjusted accordingly".
#bash examples/Robotwin/train_files/run_robotwin_train.sh

# Build trainer arguments as an array. This avoids two bugs in the original:
#  1) an unquoted empty ${freeze_module_list} made the parser consume the NEXT
#     flag (--trainer.max_train_steps) as the value of --trainer.freeze_modules;
#  2) a trailing line-continuation backslash ran into the "# --is_debug True"
#     comment line.
train_args=(
  --config_yaml "${config_yaml}"
  --framework.name "${Framework_name}"
  --framework.qwenvl.base_vlm "${base_vlm}"
  --datasets.vla_data.per_device_batch_size 8
  --datasets.vla_data.data_mix "${data_mix}"
  --datasets.vla_data.data_root_dir "${robotwin_data_root}"
  --trainer.max_train_steps 30000
  --trainer.save_interval 10000
  --trainer.logging_frequency 100
  --trainer.eval_interval 1000
  --run_root_dir "${run_root_dir}"
  --run_id "${run_id}"
  --wandb_project spirit
  --wandb_entity 1732949190-tongji-university
  # --is_debug True
)

# Only pass the freeze list when it is non-empty (see note above).
if [[ -n "${freeze_module_list}" ]]; then
  train_args+=(--trainer.freeze_modules "${freeze_module_list}")
fi

accelerate launch \
  --config_file starVLA/config/deepseeds/deepspeed_zero2.yaml \
  --num_processes 4 \
  starVLA/training/train_starvla.py \
  "${train_args[@]}"

##### Multi-Server Multi-GPU training script #####
# accelerate launch \
#   --config_file starVLA/config/deepseeds/deepspeed_zero2.yaml \
#   --main_process_ip $MASTER_ADDR \
#   --main_process_port $MASTER_PORT \
#   --machine_rank $SLURM_PROCID \
#   --num_machines $SLURM_NNODES \
#   --num_processes=${TOTAL_GPUS} \
#   starVLA/training/train_starvla.py \
#   --config_yaml ${config_yaml} \
#   --framework.name ${Framework_name} \
#   --framework.qwenvl.base_vlm ${base_vlm} \
#   --run_root_dir ${run_root_dir} \
#   --run_id ${run_id} \
#   --wandb_project your_project \
#   --wandb_entity your_name
##### Multi-Server Multi-GPU training script #####