Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

checkpoints/steps_10000_pytorch_model.pt +3 -0
checkpoints/steps_15000_pytorch_model.pt +3 -0
checkpoints/steps_5000_pytorch_model.pt +3 -0
config.full.yaml +32 -44
config.yaml +22 -21
dataset_statistics.json +95 -95
run_robotwin_cl.sh +95 -0
summary.jsonl +2 -18

checkpoints/steps_10000_pytorch_model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:787d22d53e65466541e9497d2c5cc14a3f4a20ffa332ed79e162918b94f6ed43
+size 10443815904

checkpoints/steps_15000_pytorch_model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5dffb6edde8e7ecb2575424b894445832b9c007d57d0955d5358f182dec8b2aa
+size 10443815904

checkpoints/steps_5000_pytorch_model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1739326b0c0b9c572279c7f6c3515de8f90748658cc9f6c28cb91b12ca7041af
+size 10443814850

config.full.yaml CHANGED Viewed

@@ -1,12 +1,10 @@
-run_id: 0508_1407_CL_Task1_robotwin_cpd_stage1
-run_root_dir: /mnt/data/sunxiaoquan/starVLA_ckpts
 seed: 42
-trackers:
-- jsonl
-- wandb
-wandb_entity: sunxiaoquan_2002-huazhong-university-of-science-and-tech
-wandb_project: starVLA-CL
 is_debug: false
 framework:
   name: QwenGR00T
   qwenvl:
@@ -15,14 +13,13 @@ framework:
     vl_hidden_dim: 2048
   action_model:
     action_model_type: DiT-B
     hidden_size: 1024
     add_pos_embed: true
     max_seq_len: 1024
     action_dim: 14
     state_dim: 14
-    future_action_window_size: 15
     action_horizon: 16
-    past_action_window_size: 0
     repeated_diffusion_steps: 8
     noise_beta_alpha: 1.5
     noise_beta_beta: 1.0
@@ -31,57 +28,54 @@ framework:
     num_inference_timesteps: 4
     num_target_vision_tokens: 32
     diffusion_model_cfg:
-      cross_attention_dim: 2048
       dropout: 0.2
       final_dropout: true
       interleave_self_attention: true
       norm_type: ada_norm
-      num_layers: 16
       output_dim: 2560
       positional_embeddings: null
-  obs_image_size:
-  - 224
-  - 224
 datasets:
   vlm_data:
     dataset_py: vlm_datasets
     dataformat: llava_json
-    dataset_use: asv2_conversation_en,coco_internvl_longcap_en,vqav2_en
     eval_dataset: aokvqa_cauldron_llava_format
     data_flatten: false
     base_interval: 2
     max_pixels: 50176
     min_pixels: 784
     model_max_length: 2048
-    model_type: qwen3vl
     per_device_batch_size: 4
   vla_data:
     dataset_py: lerobot_datasets
-    data_root_dir: /mnt/data/szeluresearch/datasets/robotwin_clean/
     data_mix: robotwin_cpd_stage1
     action_type: abs_qpos
-    action_horizon: 8
-    default_image_resolution:
-    - 3
-    - 224
-    - 224
-    per_device_batch_size: 8
-    load_all_data_for_training: false
-    obs:
-    - image_0
-    image_size:
     - 224
     - 224
-    num_workers: 8
     video_backend: pyav
-    debug_dataloader: false
-    data_timeout_sec: 120
 trainer:
-  epochs: 100
-  max_train_steps: 40000
-  num_warmup_steps: 2000
-  save_interval: 2000
-  eval_interval: 1000
   learning_rate:
     base: 1.0e-05
     qwen_vl_interface: 1.0e-05
@@ -89,18 +83,15 @@ trainer:
   lr_scheduler_type: cosine_with_min_lr
   scheduler_specific_kwargs:
     min_lr: 5.0e-07
-  freeze_modules: null
   loss_scale:
     vla: 1.0
     vlm: 0.1
   max_grad_norm: 1.0
-  warmup_ratio: 0.1
   weight_decay: 0.0
   logging_frequency: 100
   gradient_clipping: 1.0
   gradient_accumulation_steps: 1
-  debug_data_fetch: false
-  data_fetch_warn_sec: 5
   optimizer:
     name: AdamW
     betas:
@@ -108,8 +99,5 @@ trainer:
     - 0.95
     eps: 1.0e-08
     weight_decay: 1.0e-08
-  is_resume: false
-  resume_epoch: null
-  resume_step: null
-  enable_gradient_checkpointing: true
-  enable_mixed_precision_training: true

+run_id: 0518_robotwin_qwengr00t
+run_root_dir: ./playground/Checkpoints
 seed: 42
+wandb_entity: liberoVLA
+wandb_project: starVLA
 is_debug: false
+version_id: '0.21'
 framework:
   name: QwenGR00T
   qwenvl:
     vl_hidden_dim: 2048
   action_model:
     action_model_type: DiT-B
+    action_hidden_dim: 1024
     hidden_size: 1024
     add_pos_embed: true
     max_seq_len: 1024
     action_dim: 14
     state_dim: 14
     action_horizon: 16
     repeated_diffusion_steps: 8
     noise_beta_alpha: 1.5
     noise_beta_beta: 1.0
     num_inference_timesteps: 4
     num_target_vision_tokens: 32
     diffusion_model_cfg:
+      cross_attention_dim: 2560
       dropout: 0.2
       final_dropout: true
       interleave_self_attention: true
       norm_type: ada_norm
+      num_layers: 12
       output_dim: 2560
       positional_embeddings: null
+      dit_block_name: SwitchTransformerBlock
+      num_experts: 4
+      capacity_factor: 1.0
+      use_aux_loss: false
+    future_action_window_size: 15
+    past_action_window_size: 0
+  dino:
+    dino_backbone: dinov2_vits14
 datasets:
   vlm_data:
     dataset_py: vlm_datasets
     dataformat: llava_json
+    dataset_use: asv2_conversation_en,asv2_detailed_description_en,asv2_region_captioning_en,coco_internvl_longcap_en,coco_karpathy_train_567_en,coco_negative_gpt4o_en,coco_poetry_zh,coco_rem_en_zh,cocorem_exist_yorn_en,cocotextv2_en,cocotextv2_gpt4o_en,okvqa_en,refcoco_grounding_aug_en,refcoco_grounding_en,tallyqa_coco_en,toloka_grounding_aug_en,vqav2_en,vsr_en
     eval_dataset: aokvqa_cauldron_llava_format
     data_flatten: false
     base_interval: 2
     max_pixels: 50176
     min_pixels: 784
     model_max_length: 2048
+    model_type: qwen2.5vl
     per_device_batch_size: 4
   vla_data:
     dataset_py: lerobot_datasets
+    include_state: false
+    data_root_dir: /mnt/data/szeluresearch/datasets/robotwin_clean
     data_mix: robotwin_cpd_stage1
     action_type: abs_qpos
+    action_mode: abs
+    sequential_step_sampling: false
+    per_device_batch_size: 16
+    load_all_data_for_training: true
+    obs_image_size:
     - 224
     - 224
     video_backend: pyav
 trainer:
+  max_train_steps: 30000
+  num_warmup_steps: 5000
+  save_interval: 5000
+  eval_interval: 200
   learning_rate:
     base: 1.0e-05
     qwen_vl_interface: 1.0e-05
   lr_scheduler_type: cosine_with_min_lr
   scheduler_specific_kwargs:
     min_lr: 5.0e-07
+  freeze_modules: true
   loss_scale:
     vla: 1.0
     vlm: 0.1
   max_grad_norm: 1.0
   weight_decay: 0.0
   logging_frequency: 100
   gradient_clipping: 1.0
   gradient_accumulation_steps: 1
   optimizer:
     name: AdamW
     betas:
     - 0.95
     eps: 1.0e-08
     weight_decay: 1.0e-08
+config_yaml: ./examples/Robotwin/train_files/starvla_robotwin_cl.yaml
+output_dir: ./playground/Checkpoints/0518_robotwin_qwengr00t

config.yaml CHANGED Viewed

@@ -1,9 +1,14 @@
 datasets:
   vla_data:
     data_mix: robotwin_cpd_stage1
-    data_root_dir: /mnt/data/szeluresearch/datasets/robotwin_clean/
     dataset_py: lerobot_datasets
-    per_device_batch_size: 8
     video_backend: pyav
 framework:
   action_model:
@@ -12,14 +17,18 @@ framework:
     action_model_type: DiT-B
     add_pos_embed: true
     diffusion_model_cfg:
       cross_attention_dim: 2560
       dropout: 0.2
       final_dropout: true
       interleave_self_attention: true
       norm_type: ada_norm
-      num_layers: 16
       output_dim: 2560
       positional_embeddings: null
     hidden_size: 1024
     max_seq_len: 1024
     noise_beta_alpha: 1.5
@@ -31,41 +40,33 @@ framework:
     repeated_diffusion_steps: 8
     state_dim: 14
   name: QwenGR00T
-  obs_image_size:
-  - 224
-  - 224
   qwenvl:
     attn_implementation: flash_attention_2
     base_vlm: /mnt/data/szeluresearch/models/Qwen3-VL-4B-Instruct
-is_debug: false
-output_dir: /mnt/data/sunxiaoquan/starVLA_ckpts/0508_1407_CL_Task1_robotwin_cpd_stage1
-run_id: 0508_1407_CL_Task1_robotwin_cpd_stage1
-run_root_dir: /mnt/data/sunxiaoquan/starVLA_ckpts
 seed: 42
 trainer:
-  data_fetch_warn_sec: 5
-  debug_data_fetch: false
-  eval_interval: 1000
-  freeze_modules: null
-  gradient_accumulation_steps: 1
   gradient_clipping: 1.0
-  is_resume: false
   learning_rate:
     action_model: 0.0001
     base: 1.0e-05
     qwen_vl_interface: 1.0e-05
   logging_frequency: 100
   lr_scheduler_type: cosine_with_min_lr
-  max_train_steps: 40000
-  num_warmup_steps: 2000
   optimizer:
     betas:
     - 0.9
     - 0.95
     eps: 1.0e-08
     weight_decay: 1.0e-08
-  save_interval: 2000
   scheduler_specific_kwargs:
     min_lr: 5.0e-07
-wandb_entity: sunxiaoquan_2002-huazhong-university-of-science-and-tech
-wandb_project: starVLA-CL

 datasets:
   vla_data:
+    action_mode: abs
     data_mix: robotwin_cpd_stage1
+    data_root_dir: /mnt/data/szeluresearch/datasets/robotwin_clean
     dataset_py: lerobot_datasets
+    obs_image_size:
+    - 224
+    - 224
+    per_device_batch_size: 16
+    sequential_step_sampling: false
     video_backend: pyav
 framework:
   action_model:
     action_model_type: DiT-B
     add_pos_embed: true
     diffusion_model_cfg:
+      capacity_factor: 1.0
       cross_attention_dim: 2560
+      dit_block_name: SwitchTransformerBlock
       dropout: 0.2
       final_dropout: true
       interleave_self_attention: true
       norm_type: ada_norm
+      num_experts: 4
+      num_layers: 12
       output_dim: 2560
       positional_embeddings: null
+      use_aux_loss: false
     hidden_size: 1024
     max_seq_len: 1024
     noise_beta_alpha: 1.5
     repeated_diffusion_steps: 8
     state_dim: 14
   name: QwenGR00T
   qwenvl:
     attn_implementation: flash_attention_2
     base_vlm: /mnt/data/szeluresearch/models/Qwen3-VL-4B-Instruct
+output_dir: ./playground/Checkpoints/0518_robotwin_qwengr00t
+run_id: 0518_robotwin_qwengr00t
+run_root_dir: ./playground/Checkpoints
 seed: 42
 trainer:
+  eval_interval: 200
+  freeze_modules: true
   gradient_clipping: 1.0
   learning_rate:
     action_model: 0.0001
     base: 1.0e-05
     qwen_vl_interface: 1.0e-05
   logging_frequency: 100
   lr_scheduler_type: cosine_with_min_lr
+  max_train_steps: 30000
+  num_warmup_steps: 5000
   optimizer:
     betas:
     - 0.9
     - 0.95
     eps: 1.0e-08
     weight_decay: 1.0e-08
+  save_interval: 5000
   scheduler_specific_kwargs:
     min_lr: 5.0e-07
+wandb_entity: liberoVLA
+wandb_project: starVLA

dataset_statistics.json CHANGED Viewed

@@ -2,71 +2,71 @@
   "new_embodiment": {
     "action": {
       "mean": [
-        -0.24294417202472687,
-        0.8235027074813843,
-        0.6636196792125701,
-        -0.42967215776443485,
-        -0.01669255435699597,
-        -0.07209479324519634,
-        0.25053981244564055,
-        0.9234951853752137,
-        0.777768361568451,
-        -0.5246866762638092,
-        0.023360290518030527,
-        0.10283591970801353,
-        0.7604784965515138,
-        0.7394176125526428
       ],
       "std": [
-        0.3155222021365239,
-        0.9752432164625877,
-        0.8226579490139962,
-        0.6093221529544477,
-        0.06312679560101302,
-        0.4851952257590841,
-        0.30253945413550004,
-        0.99396639820614,
-        0.85937879866739,
-        0.61281245962018,
-        0.07908774774998067,
-        0.3755971994670261,
-        0.4139703513995478,
-        0.4240937035239007
       ],
       "max": [
         0.01999334618449211,
-        2.7223196029663086,
-        2.904675006866455,
         1.528359055519104,
-        0.19349990785121918,
         1.2732691764831543,
-        1.0510598421096802,
-        2.672729253768921,
-        2.771620273590088,
         0.9248310327529907,
         0.710218071937561,
-        1.0608506202697754,
         1.0,
         1.0
       ],
       "min": [
-        -1.0357567071914673,
         -5.257390398583084e-07,
         -2.296771708643064e-05,
-        -1.8775614500045776,
         -0.6543047428131104,
         -5.5696635246276855,
-        -0.0013324067695066333,
         -0.004139999859035015,
         -2.81171942333458e-05,
-        -1.871181845664978,
-        -0.14516803622245789,
         -1.1869020462036133,
         0.0,
         0.0
       ],
       "q01": [
-        -0.9719842076301575,
         -5.257390398583084e-07,
         -2.296771708643064e-05,
         -1.841589093208313,
@@ -76,7 +76,7 @@
         0.0,
         -2.81171942333458e-05,
         -1.8467556238174438,
-        -0.11184768699109554,
         -1.094338297843933,
         0.0,
         0.0
@@ -87,13 +87,13 @@
         2.520941734313965,
         1.3011630082130432,
         0.14792361631989484,
-        1.0777379810810088,
-        1.0109164714813232,
-        2.555085277557373,
-        2.5070955753326416,
         0.8670489948987962,
         0.6077348476648332,
-        1.011330008506775,
         1.0,
         1.0
       ],
@@ -116,71 +116,71 @@
     },
     "state": {
       "mean": [
-        -0.24117434024810794,
-        0.8168358325958253,
-        0.6583515405654908,
-        -0.4264527797698975,
-        -0.016312663888675163,
-        -0.07071333620697261,
-        0.24852103888988497,
-        0.9150182008743287,
-        0.7705055177211761,
-        -0.5195455700159074,
-        0.023118514509405937,
-        0.10122086703777314,
-        0.7641711592674256,
-        0.7438219666481019
       ],
       "std": [
-        0.3153467097269625,
-        0.9744780775362232,
-        0.8211567370036635,
-        0.6067185206649752,
-        0.06236580800754508,
-        0.4844782331695077,
-        0.30256716125241684,
-        0.9936103058734853,
-        0.8582700577812189,
-        0.6108722308122556,
-        0.07796632979349367,
-        0.3748863363076043,
-        0.41162238868969653,
-        0.4215817545812199
       ],
       "max": [
         0.01999334618449211,
-        2.7223196029663086,
-        2.904675006866455,
         1.528359055519104,
-        0.19349990785121918,
         1.2732691764831543,
-        1.0510598421096802,
-        2.672729253768921,
-        2.771620273590088,
         0.9248310327529907,
         0.710218071937561,
-        1.0608506202697754,
         1.0,
         1.0
       ],
       "min": [
-        -1.0357567071914673,
         -5.257390398583084e-07,
         -2.296771708643064e-05,
-        -1.8775614500045776,
         -0.6543047428131104,
         -5.5696635246276855,
-        -0.0013324067695066333,
         -0.004139999859035015,
         -2.81171942333458e-05,
-        -1.871181845664978,
-        -0.14516803622245789,
         -1.1869020462036133,
         0.0,
         0.0
       ],
       "q01": [
-        -0.9719842076301575,
         -5.257390398583084e-07,
         -2.296771708643064e-05,
         -1.841589093208313,
@@ -190,7 +190,7 @@
         0.0,
         -2.81171942333458e-05,
         -1.8467556238174438,
-        -0.11172551922500133,
         -1.094338297843933,
         0.0,
         0.0
@@ -201,18 +201,18 @@
         2.520941734313965,
         1.3011630082130432,
         0.14792361631989484,
-        1.0777379810810088,
-        1.0109164714813232,
-        2.555085277557373,
-        2.5070955753326416,
         0.8670489948987962,
         0.6024519658088687,
-        1.011330008506775,
         1.0,
         1.0
       ]
     },
-    "num_transitions": 44018,
     "num_trajectories": 250
   }
 }

   "new_embodiment": {
     "action": {
       "mean": [
+        -0.23787280023097992,
+        0.8526207566261292,
+        0.6795735061168672,
+        -0.44741070866584776,
+        -0.005170448598801159,
+        -0.018425086373463267,
+        0.24551648199558257,
+        0.9419304490089416,
+        0.7844514012336732,
+        -0.5366295397281646,
+        0.014951886003836989,
+        0.03796352967619896,
+        0.7625027775764466,
+        0.7451230764389039
       ],
       "std": [
+        0.3156150277603005,
+        0.9830999567243299,
+        0.8229020640521909,
+        0.6150442169166925,
+        0.050956237157350745,
+        0.49809455264644775,
+        0.3012233751046266,
+        1.0008338366289953,
+        0.8609136208311007,
+        0.6170888964271559,
+        0.07131806276214984,
+        0.391638029005536,
+        0.4115661151835227,
+        0.4197361308352289
       ],
       "max": [
         0.01999334618449211,
+        3.172096014022827,
+        3.52662992477417,
         1.528359055519104,
+        0.19807769358158112,
         1.2732691764831543,
+        1.0653150081634521,
+        3.1910557746887207,
+        3.568510055541992,
         0.9248310327529907,
         0.710218071937561,
+        1.0686789751052856,
         1.0,
         1.0
       ],
       "min": [
+        -1.05232834815979,
         -5.257390398583084e-07,
         -2.296771708643064e-05,
+        -1.8949426412582397,
         -0.6543047428131104,
         -5.5696635246276855,
+        -0.0014313478022813797,
         -0.004139999859035015,
         -2.81171942333458e-05,
+        -1.9440162181854248,
+        -0.06945601850748062,
         -1.1869020462036133,
         0.0,
         0.0
       ],
       "q01": [
+        -1.0045776522159577,
         -5.257390398583084e-07,
         -2.296771708643064e-05,
         -1.841589093208313,
         0.0,
         -2.81171942333458e-05,
         -1.8467556238174438,
+        -0.04064673036336899,
         -1.094338297843933,
         0.0,
         0.0
         2.520941734313965,
         1.3011630082130432,
         0.14792361631989484,
+        1.124965066909793,
+        1.0279909372329712,
+        2.5950185012817384,
+        2.571608304977417,
         0.8670489948987962,
         0.6077348476648332,
+        1.0272053480148315,
         1.0,
         1.0
       ],
     },
     "state": {
       "mean": [
+        -0.23635829389095306,
+        0.8473392486572265,
+        0.6753941893577576,
+        -0.4448261618614197,
+        -0.004914032170199789,
+        -0.01733654490672052,
+        0.24353725612163546,
+        0.9348157644271852,
+        0.7781344711780549,
+        -0.5321321994066239,
+        0.014636812207754703,
+        0.036555251106619835,
+        0.7653858661651612,
+        0.7485770821571351
       ],
       "std": [
+        0.31539701010848675,
+        0.9829226342610844,
+        0.821799121807512,
+        0.6126238229705022,
+        0.050130152566908955,
+        0.49730232398025426,
+        0.3009576595752512,
+        1.0006900816161217,
+        0.85964843804075,
+        0.6151594157918544,
+        0.07011113725937866,
+        0.3905743726724527,
+        0.4097133234868026,
+        0.4177009120286524
       ],
       "max": [
         0.01999334618449211,
+        3.172096014022827,
+        3.52662992477417,
         1.528359055519104,
+        0.19807769358158112,
         1.2732691764831543,
+        1.0653150081634521,
+        3.1910557746887207,
+        3.568510055541992,
         0.9248310327529907,
         0.710218071937561,
+        1.0686789751052856,
         1.0,
         1.0
       ],
       "min": [
+        -1.05232834815979,
         -5.257390398583084e-07,
         -2.296771708643064e-05,
+        -1.8949426412582397,
         -0.6543047428131104,
         -5.5696635246276855,
+        -0.0014313478022813797,
         -0.004139999859035015,
         -2.81171942333458e-05,
+        -1.9440162181854248,
+        -0.06945601850748062,
         -1.1869020462036133,
         0.0,
         0.0
       ],
       "q01": [
+        -1.0045776522159577,
         -5.257390398583084e-07,
         -2.296771708643064e-05,
         -1.841589093208313,
         0.0,
         -2.81171942333458e-05,
         -1.8467556238174438,
+        -0.04064673036336899,
         -1.094338297843933,
         0.0,
         0.0
         2.520941734313965,
         1.3011630082130432,
         0.14792361631989484,
+        1.124965066909793,
+        1.0279909372329712,
+        2.5895396542549136,
+        2.5667774224281317,
         0.8670489948987962,
         0.6024519658088687,
+        1.0272053480148315,
         1.0,
         1.0
       ]
     },
+    "num_transitions": 61506,
     "num_trajectories": 250
   }
 }

run_robotwin_cl.sh ADDED Viewed

	@@ -0,0 +1,95 @@

+export CUDA_VISIBLE_DEVICES=0,1,2,3
+# export NCCL_SOCKET_IFNAME=bond0
+# export NCCL_IB_HCA=mlx5_2,mlx5_3
+#######################################
+DEVICE=gpu                              # WARNING: cpu or gpu
+#######################################
+# used for check save when communication
+export NCCL_BLOCKING_WAIT=1
+export NCCL_ASYNC_ERROR_HANDLING=1
+export NCCL_TIMEOUT=10000  # timeout set to 1 hour (unit: seconds)
+export NCCL_SOCKET_TIMEOUT_MS=360000
+export WANDB_API_KEY=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
+###########################################################################################
+# === Please modify the following paths according to your environment ===
+Framework_name=QwenGR00T
+freeze_module_list=''
+base_vlm=/mnt/data/szeluresearch/models/Qwen3-VL-4B-Instruct
+config_yaml=./examples/Robotwin/train_files/starvla_robotwin_cl.yaml
+robotwin_data_root=/mnt/data/szeluresearch/datasets/robotwin_clean
+data_mix=robotwin_cpd_stage1
+run_root_dir=./playground/Checkpoints
+run_id=$(date +'%m%d')_robotwin_qwengr00t
+# === End of environment variable configuration ===
+###########################################################################################
+# 孩子们，robotwin太特殊了。千万别忘了把 action_horizon 跟 vla_data.per_device_batch_size 改成一样的。
+# export WANDB_MODE=disabled
+output_dir=${run_root_dir}/${run_id}
+mkdir -p ${output_dir}
+# mv this script to the output dir
+cp $0 ${output_dir}/
+log_dir="./logs/training/$(date +'%Y%m%d')"
+mkdir -p "$log_dir"
+log_file="${log_dir}/$(date +'%H%M').log"
+exec > "$log_file" 2>&1
+# exec 2>&1
+source /root/miniconda3/etc/profile.d/conda.sh
+conda activate starVLA
+conda info --envs
+# Pick the process count and attention backend for the selected DEVICE.
+if [[ "$DEVICE" = "gpu" ]]; then
+  echo "Running on GPU"
+  if [[ -n "${NUM_PROCESSES:-}" ]]; then
+    # An explicit NUM_PROCESSES always wins.
+    num_processes=${NUM_PROCESSES}
+  elif [[ -n "${CUDA_VISIBLE_DEVICES:-}" ]]; then
+    # `nvidia-smi -L` lists every GPU on the host and does not honor
+    # CUDA_VISIBLE_DEVICES, so count the entries of the visible list instead.
+    IFS=',' read -r -a visible_gpus <<< "${CUDA_VISIBLE_DEVICES}"
+    num_processes=${#visible_gpus[@]}
+  else
+    # No restriction configured: one process per GPU reported by the driver.
+    num_processes=$(nvidia-smi -L | wc -l)
+  fi
+  attn_implementation="flash_attention_2"
+else
+  echo "Running on CPU"
+  num_processes=1
+  attn_implementation="eager"
+fi
+accelerate launch \
+  --config_file starVLA/config/deepseeds/deepspeed_zero2.yaml \
+  --num_processes ${num_processes} \
+  starVLA/training/train_starvla.py \
+  --config_yaml ${config_yaml} \
+  --framework.name ${Framework_name} \
+  --framework.qwenvl.base_vlm ${base_vlm} \
+  --framework.qwenvl.attn_implementation ${attn_implementation}\
+  --datasets.vla_data.per_device_batch_size 16 \
+  --datasets.vla_data.data_root_dir ${robotwin_data_root}\
+  --datasets.vla_data.data_mix ${data_mix} \
+  --trainer.freeze_modules ${freeze_module_list} \
+  --trainer.max_train_steps 30000 \
+  --trainer.save_interval 5000 \
+  --trainer.logging_frequency 100 \
+  --trainer.eval_interval 200 \
+  --run_root_dir ${run_root_dir} \
+  --run_id ${run_id} \
+  --wandb_project starVLA \
+  --wandb_entity liberoVLA \
+  # --is_debug True
+##### Multi-Server Multi-GPU training script #####
+  # accelerate launch \
+  #   --config_file starVLA/config/deepseeds/deepspeed_zero2.yaml \
+  #   --main_process_ip $MASTER_ADDR \
+  #   --main_process_port $MASTER_PORT \
+  #   --machine_rank $SLURM_PROCID \
+  #   --num_machines $SLURM_NNODES \
+  #   --num_processes=${TOTAL_GPUS} \
+  #   starVLA/training/train_starvla.py \
+  #   --config_yaml ${config_yaml} \
+  #   --framework.name ${Framework_name} \
+  #   --framework.qwenvl.base_vlm ${base_vlm} \
+  #   --run_root_dir ${run_root_dir} \
+  #   --run_id ${run_id} \
+  #   --wandb_project your_project \
+  #   --wandb_entity your_name
+##### Multi-Server Multi-GPU training script #####

summary.jsonl CHANGED Viewed

@@ -1,20 +1,4 @@
-{"steps": 2000}
-{"steps": 4000}
-{"steps": 6000}
-{"steps": 8000}
 {"steps": 10000}
-{"steps": 12000}
-{"steps": 14000}
-{"steps": 16000}
-{"steps": 18000}
 {"steps": 20000}
-{"steps": 22000}
-{"steps": 24000}
-{"steps": 26000}
-{"steps": 28000}
-{"steps": 30000}
-{"steps": 32000}
-{"steps": 34000}
-{"steps": 36000}
-{"steps": 38000}
-{"steps": 40000}

+{"steps": 5000}
 {"steps": 10000}
+{"steps": 15000}
 {"steps": 20000}