#!/usr/bin/env bash
#
# Launch train_starvla.py on the "robotwin" data mix through `accelerate`,
# using 4 processes and the deepspeed_zero2.yaml accelerate config.
#
# Run from the repository root: the accelerate config, the training YAML and
# the training script are all referenced by relative path.
#
# Side effects:
#   - creates ${run_root_dir}/${run_id}
#   - copies this script into that directory (best effort)
#   - starts the training job; the script's exit status is that of
#     `accelerate launch`, since it is the last command.

# --- NCCL / communication settings, exported to the training processes ------
# NOTE(review): newer PyTorch releases are reported to read TORCH_NCCL_-prefixed
# variants of the first two variables — confirm against the installed version.
export NCCL_BLOCKING_WAIT=1
export NCCL_ASYNC_ERROR_HANDLING=1
export NCCL_TIMEOUT=1000
export NCCL_SOCKET_TIMEOUT_MS=360000
export NCCL_P2P_DISABLE=1

# --- Experiment configuration (edit for your environment) --------------------
Framework_name=QwenOFT
# Value for --trainer.freeze_modules; empty means the flag is passed bare.
freeze_module_list=''
base_vlm=/inspire/ssd/project/embodied-basic-model/zhangjianing-253108140206/DATASET/model/spiritv1.5
config_yaml=./examples/Robotwin/train_files/starvla_cotrain_robotwin.yaml
robotwin_data_root=/inspire/ssd/project/embodied-basic-model/zhangjianing-253108140206/DATASET/robotwin_lerobot
run_root_dir=/inspire/ssd/project/embodied-basic-model/zhangjianing-253108140206/experiment/spirit_vla/starvla-vla/results
data_mix=robotwin
run_id=124_${data_mix}_spirit

# Weights & Biases is disabled for this run.
# NOTE(review): the --wandb_* flags below are still passed; presumably they are
# inert while WANDB_MODE=disabled — confirm.
export WANDB_MODE=disabled

# --- Output directory --------------------------------------------------------
output_dir=${run_root_dir}/${run_id}
if ! mkdir -p -- "${output_dir}"; then
  printf 'error: cannot create output directory: %s\n' "${output_dir}" >&2
  exit 1
fi

# Keep a copy of this launch script next to the run outputs. A failed copy
# only produces a warning; it does not stop the training launch.
cp -- "$0" "${output_dir}/" ||
  printf 'warning: could not copy %s into %s\n' "$0" "${output_dir}" >&2

# --- Launch ------------------------------------------------------------------
# ${freeze_module_list:+"..."} expands to nothing when the list is empty, so
# --trainer.freeze_modules is then followed directly by the next flag; when
# the list is non-empty it is passed as a single argument.
# NOTE(review): confirm how train_starvla.py interprets a bare
# --trainer.freeze_modules.
accelerate launch \
  --config_file starVLA/config/deepseeds/deepspeed_zero2.yaml \
  --num_processes 4 \
  starVLA/training/train_starvla.py \
  --config_yaml "${config_yaml}" \
  --framework.name "${Framework_name}" \
  --framework.qwenvl.base_vlm "${base_vlm}" \
  --datasets.vla_data.per_device_batch_size 8 \
  --datasets.vla_data.data_mix "${data_mix}" \
  --datasets.vla_data.data_root_dir "${robotwin_data_root}" \
  --trainer.freeze_modules ${freeze_module_list:+"${freeze_module_list}"} \
  --trainer.max_train_steps 30000 \
  --trainer.save_interval 10000 \
  --trainer.logging_frequency 100 \
  --trainer.eval_interval 1000 \
  --run_root_dir "${run_root_dir}" \
  --run_id "${run_id}" \
  --wandb_project spirit \
  --wandb_entity 1732949190-tongji-university