upload model directory

Browse files

Files changed (5) hide show

checkpoints/steps_30000_pytorch_model.pt +3 -0
config.yaml +49 -0
dataset_statistics.json +218 -0
run_robotwin_train.sh +77 -0
summary.jsonl +3 -0

checkpoints/steps_30000_pytorch_model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:52e1f230b12bf636a7d4460f43aeb2afa68ba3cc62777739f7d7a38a1fb0b087
+size 9785132555

config.yaml ADDED Viewed

	@@ -0,0 +1,49 @@

+datasets:
+  vla_data:
+    data_mix: robotwin
+    data_root_dir: /inspire/ssd/project/embodied-basic-model/zhangjianing-253108140206/DATASET/robotwin_lerobot
+    dataset_py: lerobot_datasets
+    image_size:
+    - 224
+    - 224
+    per_device_batch_size: 8
+    video_backend: torchvision_av
+framework:
+  action_model:
+    action_dim: 14
+    action_hidden_dim: 2560
+    action_model_type: DiT-B
+    future_action_window_size: 15
+    past_action_window_size: 0
+  name: QwenOFT
+  qwenvl:
+    base_vlm: /inspire/ssd/project/embodied-basic-model/zhangjianing-253108140206/DATASET/model/spiritv1.5
+output_dir: /inspire/ssd/project/embodied-basic-model/zhangjianing-253108140206/experiment/spirit_vla/starvla-vla/results/124_robotwin_spirit
+run_id: 124_robotwin_spirit
+run_root_dir: /inspire/ssd/project/embodied-basic-model/zhangjianing-253108140206/experiment/spirit_vla/starvla-vla/results
+seed: 42
+trainer:
+  eval_interval: 1000
+  freeze_modules: true
+  gradient_accumulation_steps: 1
+  gradient_clipping: 1.0
+  is_resume: false
+  learning_rate:
+    action_model: 0.0001
+    base: 1.0e-05
+    qwen_vl_interface: 1.0e-05
+  logging_frequency: 100
+  lr_scheduler_type: cosine_with_min_lr
+  max_train_steps: 30000
+  num_warmup_steps: 5000
+  optimizer:
+    betas:
+    - 0.9
+    - 0.95
+    eps: 1.0e-08
+    weight_decay: 1.0e-08
+  save_interval: 10000
+  scheduler_specific_kwargs:
+    min_lr: 5.0e-07
+wandb_entity: 1732949190-tongji-university
+wandb_project: spirit

dataset_statistics.json ADDED Viewed

	@@ -0,0 +1,218 @@

+{
+  "new_embodiment": {
+    "action": {
+      "mean": [
+        -0.2331667154282331,
+        1.1028118824958806,
+        0.7864713907241822,
+        -0.32033259890973564,
+        0.05814607566400812,
+        -0.05603163477033378,
+        0.21005579456686974,
+        1.0977823150157928,
+        0.8011256510019301,
+        -0.34791447412222615,
+        -0.022669792570286517,
+        0.059191535860300064,
+        0.671402801275253,
+        0.6659822088479995
+      ],
+      "std": [
+        0.40488538027628157,
+        1.0056974943165093,
+        0.7723789897163711,
+        0.6712645336528242,
+        0.28260278188743754,
+        0.6757600816670439,
+        0.3930471656426581,
+        1.0201486874323196,
+        0.7930296339277983,
+        0.6864149816970117,
+        0.2509440636057764,
+        0.6816604421564468,
+        0.45032166654934785,
+        0.4520699954092942
+      ],
+      "max": [
+        0.4363388121128082,
+        3.896630048751831,
+        4.553252220153809,
+        1.791752576828003,
+        1.6647447347640991,
+        4.326117515563965,
+        3.3414716720581055,
+        3.5858347415924072,
+        5.672450065612793,
+        1.9447470903396606,
+        1.5042771100997925,
+        3.819632053375244,
+        1.0,
+        1.0
+      ],
+      "min": [
+        -7.321954727172852,
+        -0.00418000016361475,
+        -0.0149909146130085,
+        -1.9549700021743774,
+        -1.43248450756073,
+        -7.091593265533447,
+        -8.539926528930664,
+        -0.5945725440979004,
+        -0.07252676039934158,
+        -2.0857622623443604,
+        -2.047459840774536,
+        -6.275933742523193,
+        0.0,
+        0.0
+      ],
+      "q01": [
+        -7.156214237213135,
+        -5.257390398583084e-07,
+        -2.8215323254698887e-05,
+        -1.8530020713806152,
+        -1.3616564273834229,
+        -6.243625698089599,
+        -8.494686126708984,
+        -0.5754004126787186,
+        -2.81171942333458e-05,
+        -1.8067627024650574,
+        -1.4502456188201904,
+        -5.74780608177185,
+        0.0,
+        0.0
+      ],
+      "q99": [
+        0.4322364914417267,
+        3.528747615814209,
+        4.213814439773559,
+        1.6591367983818048,
+        1.4808999300003052,
+        2.9189868807792663,
+        1.2362913405895235,
+        3.00386118888855,
+        4.1129137754440315,
+        1.75497855067253,
+        1.501461386680603,
+        3.7943292021751405,
+        1.0,
+        1.0
+      ],
+      "mask": [
+        true,
+        true,
+        true,
+        true,
+        true,
+        true,
+        true,
+        true,
+        true,
+        true,
+        true,
+        true,
+        false,
+        false
+      ]
+    },
+    "state": {
+      "mean": [
+        -0.23170382969081404,
+        1.0965768384933474,
+        0.7819626295566559,
+        -0.31852622993290425,
+        0.057760832709902836,
+        -0.055021945205517134,
+        0.20828876227140425,
+        1.0905675184726715,
+        0.7958361715078353,
+        -0.34572803400456903,
+        -0.02242892236566149,
+        0.058168093403801316,
+        0.6732750406861303,
+        0.6677672982215882
+      ],
+      "std": [
+        0.4041338455301996,
+        1.006313901997396,
+        0.7722665737866291,
+        0.6693469932644355,
+        0.2816361902175701,
+        0.6729632740733544,
+        0.39232694117902944,
+        1.0205017587198142,
+        0.7927670273279362,
+        0.684256277696324,
+        0.24975242963368358,
+        0.6782357193592726,
+        0.4496057394878301,
+        0.4514107074270294
+      ],
+      "max": [
+        0.4363388121128082,
+        3.896630048751831,
+        4.553252220153809,
+        1.791752576828003,
+        1.6647447347640991,
+        4.326117515563965,
+        3.3414716720581055,
+        3.5858347415924072,
+        5.672450065612793,
+        1.9440714120864868,
+        1.5042771100997925,
+        3.819632053375244,
+        1.0,
+        1.0
+      ],
+      "min": [
+        -7.321954727172852,
+        -0.00418000016361475,
+        -0.0149909146130085,
+        -1.9549700021743774,
+        -1.43248450756073,
+        -7.091593265533447,
+        -8.539926528930664,
+        -0.5945725440979004,
+        -0.07252676039934158,
+        -2.0857622623443604,
+        -2.047459840774536,
+        -6.275933742523193,
+        0.0,
+        0.0
+      ],
+      "q01": [
+        -7.156214237213135,
+        -5.257390398583084e-07,
+        -2.8215323254698887e-05,
+        -1.8530020713806152,
+        -1.3616564273834229,
+        -6.243625698089599,
+        -8.494686126708984,
+        -0.5754004126787186,
+        -2.81171942333458e-05,
+        -1.8009709119796753,
+        -1.4502456188201904,
+        -5.647760705947876,
+        0.0,
+        0.0
+      ],
+      "q99": [
+        0.4317424774169923,
+        3.5283490157127373,
+        4.2126740026473986,
+        1.6591367983818048,
+        1.4808999300003052,
+        2.9188456654548647,
+        1.2358578193187715,
+        3.00386118888855,
+        4.1129137754440315,
+        1.7217634475231163,
+        1.501461386680603,
+        3.793578088283539,
+        1.0,
+        1.0
+      ]
+    },
+    "num_transitions": 552050,
+    "num_trajectories": 2500
+  }
+}

run_robotwin_train.sh ADDED Viewed

	@@ -0,0 +1,77 @@

+# export NCCL_SOCKET_IFNAME=bond0
+# export NCCL_IB_HCA=mlx5_2,mlx5_3
+# Used to check for communication problems when saving (NCCL wait/timeout settings)
+export NCCL_BLOCKING_WAIT=1
+export NCCL_ASYNC_ERROR_HANDLING=1
+export NCCL_TIMEOUT=1000  # timeout of 1000 seconds (~16.7 minutes); use 3600 if 1 hour is intended
+export NCCL_SOCKET_TIMEOUT_MS=360000
+export NCCL_P2P_DISABLE=1
+# export NCCL_DEBUG=INFO
+# export NCCL_DEBUG_SUBSYS=ALL
+# export TORCH_DISTRIBUTED_DEBUG=DETAIL
+###########################################################################################
+# === Please modify the following paths according to your environment ===
+Framework_name=QwenOFT
+freeze_module_list=''
+base_vlm=/inspire/ssd/project/embodied-basic-model/zhangjianing-253108140206/DATASET/model/spiritv1.5
+config_yaml=./examples/Robotwin/train_files/starvla_cotrain_robotwin.yaml
+robotwin_data_root=/inspire/ssd/project/embodied-basic-model/zhangjianing-253108140206/DATASET/robotwin_lerobot
+run_root_dir=/inspire/ssd/project/embodied-basic-model/zhangjianing-253108140206/experiment/spirit_vla/starvla-vla/results
+data_mix=robotwin
+run_id=124_${data_mix}_spirit
+# === End of environment variable configuration ===
+###########################################################################################
+#batchsize=24
+export WANDB_MODE=disabled
+output_dir=${run_root_dir}/${run_id}
+mkdir -p ${output_dir}
+# Copy this script to the output dir
+cp $0 ${output_dir}/
+# The data here does not include put_object_dustbin and scan objects; the mixtures were changed accordingly
+#bash examples/Robotwin/train_files/run_robotwin_train.sh
+accelerate launch \
+  --config_file starVLA/config/deepseeds/deepspeed_zero2.yaml \
+  --num_processes 4 \
+  starVLA/training/train_starvla.py \
+  --config_yaml ${config_yaml} \
+  --framework.name ${Framework_name} \
+  --framework.qwenvl.base_vlm ${base_vlm} \
+  --datasets.vla_data.per_device_batch_size 8 \
+  --datasets.vla_data.data_mix ${data_mix} \
+  --datasets.vla_data.data_root_dir ${robotwin_data_root}\
+  --trainer.freeze_modules ${freeze_module_list} \
+  --trainer.max_train_steps 30000 \
+  --trainer.save_interval 10000 \
+  --trainer.logging_frequency 100 \
+  --trainer.eval_interval 1000 \
+  --run_root_dir ${run_root_dir} \
+  --run_id ${run_id} \
+  --wandb_project spirit \
+  --wandb_entity 1732949190-tongji-university  \
+  # --is_debug True
+##### Multi-Server Multi-GPU training script #####
+  # accelerate launch \
+  #   --config_file starVLA/config/deepseeds/deepspeed_zero2.yaml \
+  #   --main_process_ip $MASTER_ADDR \
+  #   --main_process_port $MASTER_PORT \
+  #   --machine_rank $SLURM_PROCID \
+  #   --num_machines $SLURM_NNODES \
+  #   --num_processes=${TOTAL_GPUS} \
+  #   starVLA/training/train_starvla.py \
+  #   --config_yaml ${config_yaml} \
+  #   --framework.name ${Framework_name} \
+  #   --framework.qwenvl.base_vlm ${base_vlm} \
+  #   --run_root_dir ${run_root_dir} \
+  #   --run_id ${run_id} \
+  #   --wandb_project your_project \
+  #   --wandb_entity your_name
+##### Multi-Server Multi-GPU training script #####

summary.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+{"steps": 10000}
+{"steps": 20000}
+{"steps": 30000}