Upload model files

Browse files

Files changed (5) hide show

checkponts/steps_120000_pytorch_model.pt +3 -0
config.yaml +49 -0
dataset_statistics.json +218 -0
run_robotwin_train.sh +88 -0
summary.jsonl +12 -0

checkponts/steps_120000_pytorch_model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:648e314960e5218b735b5061ad39341b9fedc82d51ef7c3c47bd846d1ffe8d1b
+size 9803391211

config.yaml ADDED Viewed

	@@ -0,0 +1,49 @@

+datasets:  # resolved training configuration recorded for run cubev1_robotwin_oft_27500
+  vla_data:
+    data_mix: robotwin
+    data_root_dir: /home/jiangjiahao/data/Robotwin_lerobot_25000  # NOTE(review): path says 25000, run_id and dataset_statistics.json say 27500 trajectories - confirm
+    dataset_py: lerobot_datasets
+    image_size:
+    - 448
+    - 448
+    per_device_batch_size: 8  # per process; run_robotwin_train.sh launches with --num_processes 8
+    video_backend: torchvision_av
+framework:
+  action_model:
+    action_dim: 14  # same length as the 14-element vectors in dataset_statistics.json
+    action_hidden_dim: 2560
+    action_model_type: DiT-B
+    future_action_window_size: 15
+    past_action_window_size: 0
+  name: QwenOFT
+  qwenvl:
+    base_vlm: /home/jiangjiahao/data/model/CUBEv1-510k
+output_dir: /home/jiangjiahao/data/ckpt/cubev1-Robotwin-oft/cubev1_robotwin_oft_27500  # run_root_dir/run_id, as built in run_robotwin_train.sh
+run_id: cubev1_robotwin_oft_27500
+run_root_dir: /home/jiangjiahao/data/ckpt/cubev1-Robotwin-oft
+seed: 42
+trainer:
+  eval_interval: 2000
+  freeze_modules: true  # NOTE(review): the launch script passes an empty, unquoted freeze_module_list, so this boolean is presumably a value-less CLI flag rather than a module list - confirm
+  gradient_accumulation_steps: 1
+  gradient_clipping: 1.0
+  is_resume: false
+  learning_rate:
+    action_model: 0.0001
+    base: 1.0e-05
+    qwen_vl_interface: 1.0e-05
+  logging_frequency: 10
+  lr_scheduler_type: cosine_with_min_lr
+  max_train_steps: 120000  # same step count as the uploaded steps_120000 checkpoint
+  num_warmup_steps: 100
+  optimizer:
+    betas:
+    - 0.9
+    - 0.95
+    eps: 1.0e-08
+    weight_decay: 1.0e-08
+  save_interval: 10000  # summary.jsonl has one entry per 10000 steps up to 120000
+  scheduler_specific_kwargs:
+    min_lr: 5.0e-07  # floor for the cosine_with_min_lr scheduler selected above
+wandb_entity: 1732949190-tongji-university
+wandb_project: cubev1-robotwin

dataset_statistics.json ADDED Viewed

	@@ -0,0 +1,218 @@

+{
+  "new_embodiment": {
+    "action": {
+      "mean": [
+        -0.22850697100162506,
+        1.091255302429199,
+        0.7823147076368331,
+        -0.32197853002697246,
+        0.05992379891758902,
+        -0.05961565947276542,
+        0.21911913707852365,
+        1.116218321323395,
+        0.8152503395080565,
+        -0.3515907554514706,
+        -0.024504098349716518,
+        0.06346578799333659,
+        0.6748778066039086,
+        0.6624946933984757
+      ],
+      "std": [
+        0.402594091466037,
+        1.0092194173693285,
+        0.7798156226080691,
+        0.6712288472954009,
+        0.2760877076644332,
+        0.6674429751030392,
+        0.3648147266299479,
+        1.0210443837768437,
+        0.7975659273726962,
+        0.6918564153916102,
+        0.25645031777618,
+        0.6843208945443547,
+        0.44897980397487763,
+        0.4533433338680497
+      ],
+      "max": [
+        6.45659065246582,
+        4.179152488708496,
+        5.346591472625732,
+        1.7942548990249634,
+        1.8604620695114136,
+        5.43813943862915,
+        7.142920017242432,
+        4.157068729400635,
+        5.672450065612793,
+        1.95806884765625,
+        1.5663840770721436,
+        5.278968811035156,
+        1.0,
+        1.0
+      ],
+      "min": [
+        -7.552278995513916,
+        -0.36354875564575195,
+        -0.18577136099338531,
+        -1.956291913986206,
+        -1.6801013946533203,
+        -7.678869724273682,
+        -8.539926528930664,
+        -0.6294453740119934,
+        -0.07775841653347015,
+        -2.1328067779541016,
+        -2.1285502910614014,
+        -8.228925704956055,
+        0.0,
+        0.0
+      ],
+      "q01": [
+        -7.171878066062927,
+        -5.257390398583084e-07,
+        -2.8215323254698887e-05,
+        -1.82795250415802,
+        -1.2848057746887207,
+        -6.267534255981445,
+        -4.3770854473114005,
+        -0.5723201632499695,
+        -2.81171942333458e-05,
+        -1.8314584493637085,
+        -1.4415955007076264,
+        -3.9822757244110107,
+        0.0,
+        0.0
+      ],
+      "q99": [
+        0.43208695352077486,
+        3.4770532965660093,
+        4.1923715734481775,
+        1.7063970947265623,
+        1.480757713317871,
+        3.536303358078003,
+        1.241659164428711,
+        2.9545636367797856,
+        3.1681246757507324,
+        1.7642610073089595,
+        1.4034956693649292,
+        3.5513664150238116,
+        1.0,
+        1.0
+      ],
+      "mask": [
+        true,
+        true,
+        true,
+        true,
+        true,
+        true,
+        true,
+        true,
+        true,
+        true,
+        true,
+        true,
+        false,
+        false
+      ]
+    },
+    "state": {
+      "mean": [
+        -0.22706023678183557,
+        1.0850424456596375,
+        0.777808437347412,
+        -0.3201701681315898,
+        0.0595202032396628,
+        -0.05860614460660145,
+        0.21736630663275713,
+        1.108994129896164,
+        0.8099701321125031,
+        -0.3494409777689725,
+        -0.024254425946601255,
+        0.06242063357291044,
+        0.6767309018969537,
+        0.6642977863550187
+      ],
+      "std": [
+        0.4016373427277558,
+        1.0097285696489646,
+        0.7796544490700344,
+        0.669274274296572,
+        0.2751266155546341,
+        0.6645414709593972,
+        0.36395365524544354,
+        1.0216013074038832,
+        0.7973548695004652,
+        0.6897165913126294,
+        0.25528714760749266,
+        0.6809202797287219,
+        0.4482745152582334,
+        0.4526825568381593
+      ],
+      "max": [
+        6.45659065246582,
+        4.179152488708496,
+        5.346591472625732,
+        1.7942548990249634,
+        1.8604620695114136,
+        5.43813943862915,
+        7.142920017242432,
+        4.157068729400635,
+        5.672450065612793,
+        1.95806884765625,
+        1.5663840770721436,
+        5.278968811035156,
+        1.0,
+        1.0
+      ],
+      "min": [
+        -7.552278995513916,
+        -0.36354875564575195,
+        -0.18577136099338531,
+        -1.956291913986206,
+        -1.6801013946533203,
+        -7.678869724273682,
+        -8.539926528930664,
+        -0.6294453740119934,
+        -0.07775841653347015,
+        -2.1328067779541016,
+        -2.1285502910614014,
+        -8.228925704956055,
+        0.0,
+        0.0
+      ],
+      "q01": [
+        -7.171878066062927,
+        -5.257390398583084e-07,
+        -2.8215323254698887e-05,
+        -1.8259083151817321,
+        -1.2847390174865723,
+        -6.267534255981445,
+        -4.239017934799194,
+        -0.5723201632499695,
+        -2.81171942333458e-05,
+        -1.829010078907013,
+        -1.4388524293899536,
+        -3.9822757244110107,
+        0.0,
+        0.0
+      ],
+      "q99": [
+        0.4316858792304993,
+        3.475829310417172,
+        4.190561800003047,
+        1.7063970947265623,
+        1.478604793548584,
+        3.4258731079101734,
+        1.241659164428711,
+        2.9545636367797856,
+        3.1681246757507324,
+        1.7433684396743772,
+        1.4034956693649292,
+        3.548548698425293,
+        1.0,
+        1.0
+      ]
+    },
+    "num_transitions": 6075103,
+    "num_trajectories": 27500
+  }
+}

run_robotwin_train.sh ADDED Viewed

	@@ -0,0 +1,88 @@

+# export NCCL_SOCKET_IFNAME=bond0
+# export NCCL_IB_HCA=mlx5_2,mlx5_3
+# export NCCL_IB_HCA=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
+# export NCCL_IB_DISABLE=0
+# export NCCL_SOCKET_IFNAME=bond0
+# export NCCL_DEBUG=INFO
+# export NCCL_NVLS_ENABLE=0
+# used to check saving during communication
+# export NCCL_BLOCKING_WAIT=1
+# export NCCL_ASYNC_ERROR_HANDLING=1
+# Add before running
+# export NCCL_ALGO=Ring
+# export NCCL_PROTO=Simple
+# export NCCL_SHM_DISABLE=1
+# export NCCL_TIMEOUT=1000  # timeout in seconds (1000 s is about 17 minutes; use 3600 for 1 hour)
+# export NCCL_SOCKET_TIMEOUT_MS=360000
+export NCCL_P2P_DISABLE=1  # only active NCCL setting in this script: turns off NCCL's peer-to-peer (direct GPU-to-GPU) transport
+# export CFLAGS="-I/usr/include"
+# export LDFLAGS="-L/usr/lib/x86_64-linux-gnu"
+# export NCCL_DEBUG=INFO
+# export NCCL_DEBUG_SUBSYS=ALL
+# export TORCH_DISTRIBUTED_DEBUG=DETAIL
+# export CUDA_VISIBLE_DEVICES=0,1,2,3
+###########################################################################################
+# === Please modify the following paths according to your environment ===
+Framework_name=QwenOFT
+freeze_module_list=''
+base_vlm=/home/jiangjiahao/data/model/CUBEv1-510k
+config_yaml=./examples/Robotwin/train_files/starvla_cotrain_robotwin.yaml
+robotwin_data_root=/home/jiangjiahao/data/Robotwin_lerobot_25000
+run_root_dir=/home/jiangjiahao/data/ckpt/cubev1-Robotwin-oft
+data_mix=robotwin
+run_id=cubev1_${data_mix}_oft_27500
+# === End of environment variable configuration ===
+###########################################################################################
+#batchsize=24
+# export WANDB_MODE=disabled
+output_dir=${run_root_dir}/${run_id}
+mkdir -p ${output_dir}
+# mv this script to the output dir
+cp $0 ${output_dir}/
+# The data here excludes put_object_dustbin and scan objects; the mixtures were changed
+#bash examples/Robotwin/train_files/run_robotwin_train.sh
+accelerate launch \
+  --config_file starVLA/config/deepseeds/deepspeed_zero2.yaml \
+  --num_processes 8 \
+  --main_process_port 29500 \
+  starVLA/training/train_starvla.py \
+  --config_yaml ${config_yaml} \
+  --framework.name ${Framework_name} \
+  --framework.qwenvl.base_vlm ${base_vlm} \
+  --datasets.vla_data.per_device_batch_size 8 \
+  --datasets.vla_data.data_mix ${data_mix} \
+  --datasets.vla_data.data_root_dir ${robotwin_data_root} \
+  --trainer.freeze_modules ${freeze_module_list} \
+  --trainer.max_train_steps 120000 \
+  --trainer.save_interval 10000 \
+  --trainer.logging_frequency 10 \
+  --trainer.eval_interval 2000 \
+  --run_root_dir ${run_root_dir} \
+  --run_id ${run_id} \
+  --wandb_project cubev1-robotwin \
+  --wandb_entity  1732949190-tongji-university  \
+  # --is_debug True
+##### Multi-Server Multi-GPU training script #####
+  # accelerate launch \
+  #   --config_file starVLA/config/deepseeds/deepspeed_zero2.yaml \
+  #   --main_process_ip $MASTER_ADDR \
+  #   --main_process_port $MASTER_PORT \
+  #   --machine_rank $SLURM_PROCID \
+  #   --num_machines $SLURM_NNODES \
+  #   --num_processes=${TOTAL_GPUS} \
+  #   starVLA/training/train_starvla.py \
+  #   --config_yaml ${config_yaml} \
+  #   --framework.name ${Framework_name} \
+  #   --framework.qwenvl.base_vlm ${base_vlm} \
+  #   --run_root_dir ${run_root_dir} \
+  #   --run_id ${run_id} \
+  #   --wandb_project your_project \
+  #   --wandb_entity your_name
+##### Multi-Server Multi-GPU training script #####

summary.jsonl ADDED Viewed

	@@ -0,0 +1,12 @@

+{"steps": 10000}
+{"steps": 20000}
+{"steps": 30000}
+{"steps": 40000}
+{"steps": 50000}
+{"steps": 60000}
+{"steps": 70000}
+{"steps": 80000}
+{"steps": 90000}
+{"steps": 100000}
+{"steps": 110000}
+{"steps": 120000}