SII-LibAI committed on
Commit
5713b5d
·
verified ·
1 Parent(s): 87f0a59

Upload model files

Browse files
checkponts/steps_120000_pytorch_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:648e314960e5218b735b5061ad39341b9fedc82d51ef7c3c47bd846d1ffe8d1b
3
+ size 9803391211
config.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets:
2
+ vla_data:
3
+ data_mix: robotwin
4
+ data_root_dir: /home/jiangjiahao/data/Robotwin_lerobot_25000
5
+ dataset_py: lerobot_datasets
6
+ image_size:
7
+ - 448
8
+ - 448
9
+ per_device_batch_size: 8
10
+ video_backend: torchvision_av
11
+ framework:
12
+ action_model:
13
+ action_dim: 14
14
+ action_hidden_dim: 2560
15
+ action_model_type: DiT-B
16
+ future_action_window_size: 15
17
+ past_action_window_size: 0
18
+ name: QwenOFT
19
+ qwenvl:
20
+ base_vlm: /home/jiangjiahao/data/model/CUBEv1-510k
21
+ output_dir: /home/jiangjiahao/data/ckpt/cubev1-Robotwin-oft/cubev1_robotwin_oft_27500
22
+ run_id: cubev1_robotwin_oft_27500
23
+ run_root_dir: /home/jiangjiahao/data/ckpt/cubev1-Robotwin-oft
24
+ seed: 42
25
+ trainer:
26
+ eval_interval: 2000
27
+ freeze_modules: true
28
+ gradient_accumulation_steps: 1
29
+ gradient_clipping: 1.0
30
+ is_resume: false
31
+ learning_rate:
32
+ action_model: 0.0001
33
+ base: 1.0e-05
34
+ qwen_vl_interface: 1.0e-05
35
+ logging_frequency: 10
36
+ lr_scheduler_type: cosine_with_min_lr
37
+ max_train_steps: 120000
38
+ num_warmup_steps: 100
39
+ optimizer:
40
+ betas:
41
+ - 0.9
42
+ - 0.95
43
+ eps: 1.0e-08
44
+ weight_decay: 1.0e-08
45
+ save_interval: 10000
46
+ scheduler_specific_kwargs:
47
+ min_lr: 5.0e-07
48
+ wandb_entity: 1732949190-tongji-university
49
+ wandb_project: cubev1-robotwin
dataset_statistics.json ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "new_embodiment": {
3
+ "action": {
4
+ "mean": [
5
+ -0.22850697100162506,
6
+ 1.091255302429199,
7
+ 0.7823147076368331,
8
+ -0.32197853002697246,
9
+ 0.05992379891758902,
10
+ -0.05961565947276542,
11
+ 0.21911913707852365,
12
+ 1.116218321323395,
13
+ 0.8152503395080565,
14
+ -0.3515907554514706,
15
+ -0.024504098349716518,
16
+ 0.06346578799333659,
17
+ 0.6748778066039086,
18
+ 0.6624946933984757
19
+ ],
20
+ "std": [
21
+ 0.402594091466037,
22
+ 1.0092194173693285,
23
+ 0.7798156226080691,
24
+ 0.6712288472954009,
25
+ 0.2760877076644332,
26
+ 0.6674429751030392,
27
+ 0.3648147266299479,
28
+ 1.0210443837768437,
29
+ 0.7975659273726962,
30
+ 0.6918564153916102,
31
+ 0.25645031777618,
32
+ 0.6843208945443547,
33
+ 0.44897980397487763,
34
+ 0.4533433338680497
35
+ ],
36
+ "max": [
37
+ 6.45659065246582,
38
+ 4.179152488708496,
39
+ 5.346591472625732,
40
+ 1.7942548990249634,
41
+ 1.8604620695114136,
42
+ 5.43813943862915,
43
+ 7.142920017242432,
44
+ 4.157068729400635,
45
+ 5.672450065612793,
46
+ 1.95806884765625,
47
+ 1.5663840770721436,
48
+ 5.278968811035156,
49
+ 1.0,
50
+ 1.0
51
+ ],
52
+ "min": [
53
+ -7.552278995513916,
54
+ -0.36354875564575195,
55
+ -0.18577136099338531,
56
+ -1.956291913986206,
57
+ -1.6801013946533203,
58
+ -7.678869724273682,
59
+ -8.539926528930664,
60
+ -0.6294453740119934,
61
+ -0.07775841653347015,
62
+ -2.1328067779541016,
63
+ -2.1285502910614014,
64
+ -8.228925704956055,
65
+ 0.0,
66
+ 0.0
67
+ ],
68
+ "q01": [
69
+ -7.171878066062927,
70
+ -5.257390398583084e-07,
71
+ -2.8215323254698887e-05,
72
+ -1.82795250415802,
73
+ -1.2848057746887207,
74
+ -6.267534255981445,
75
+ -4.3770854473114005,
76
+ -0.5723201632499695,
77
+ -2.81171942333458e-05,
78
+ -1.8314584493637085,
79
+ -1.4415955007076264,
80
+ -3.9822757244110107,
81
+ 0.0,
82
+ 0.0
83
+ ],
84
+ "q99": [
85
+ 0.43208695352077486,
86
+ 3.4770532965660093,
87
+ 4.1923715734481775,
88
+ 1.7063970947265623,
89
+ 1.480757713317871,
90
+ 3.536303358078003,
91
+ 1.241659164428711,
92
+ 2.9545636367797856,
93
+ 3.1681246757507324,
94
+ 1.7642610073089595,
95
+ 1.4034956693649292,
96
+ 3.5513664150238116,
97
+ 1.0,
98
+ 1.0
99
+ ],
100
+ "mask": [
101
+ true,
102
+ true,
103
+ true,
104
+ true,
105
+ true,
106
+ true,
107
+ true,
108
+ true,
109
+ true,
110
+ true,
111
+ true,
112
+ true,
113
+ false,
114
+ false
115
+ ]
116
+ },
117
+ "state": {
118
+ "mean": [
119
+ -0.22706023678183557,
120
+ 1.0850424456596375,
121
+ 0.777808437347412,
122
+ -0.3201701681315898,
123
+ 0.0595202032396628,
124
+ -0.05860614460660145,
125
+ 0.21736630663275713,
126
+ 1.108994129896164,
127
+ 0.8099701321125031,
128
+ -0.3494409777689725,
129
+ -0.024254425946601255,
130
+ 0.06242063357291044,
131
+ 0.6767309018969537,
132
+ 0.6642977863550187
133
+ ],
134
+ "std": [
135
+ 0.4016373427277558,
136
+ 1.0097285696489646,
137
+ 0.7796544490700344,
138
+ 0.669274274296572,
139
+ 0.2751266155546341,
140
+ 0.6645414709593972,
141
+ 0.36395365524544354,
142
+ 1.0216013074038832,
143
+ 0.7973548695004652,
144
+ 0.6897165913126294,
145
+ 0.25528714760749266,
146
+ 0.6809202797287219,
147
+ 0.4482745152582334,
148
+ 0.4526825568381593
149
+ ],
150
+ "max": [
151
+ 6.45659065246582,
152
+ 4.179152488708496,
153
+ 5.346591472625732,
154
+ 1.7942548990249634,
155
+ 1.8604620695114136,
156
+ 5.43813943862915,
157
+ 7.142920017242432,
158
+ 4.157068729400635,
159
+ 5.672450065612793,
160
+ 1.95806884765625,
161
+ 1.5663840770721436,
162
+ 5.278968811035156,
163
+ 1.0,
164
+ 1.0
165
+ ],
166
+ "min": [
167
+ -7.552278995513916,
168
+ -0.36354875564575195,
169
+ -0.18577136099338531,
170
+ -1.956291913986206,
171
+ -1.6801013946533203,
172
+ -7.678869724273682,
173
+ -8.539926528930664,
174
+ -0.6294453740119934,
175
+ -0.07775841653347015,
176
+ -2.1328067779541016,
177
+ -2.1285502910614014,
178
+ -8.228925704956055,
179
+ 0.0,
180
+ 0.0
181
+ ],
182
+ "q01": [
183
+ -7.171878066062927,
184
+ -5.257390398583084e-07,
185
+ -2.8215323254698887e-05,
186
+ -1.8259083151817321,
187
+ -1.2847390174865723,
188
+ -6.267534255981445,
189
+ -4.239017934799194,
190
+ -0.5723201632499695,
191
+ -2.81171942333458e-05,
192
+ -1.829010078907013,
193
+ -1.4388524293899536,
194
+ -3.9822757244110107,
195
+ 0.0,
196
+ 0.0
197
+ ],
198
+ "q99": [
199
+ 0.4316858792304993,
200
+ 3.475829310417172,
201
+ 4.190561800003047,
202
+ 1.7063970947265623,
203
+ 1.478604793548584,
204
+ 3.4258731079101734,
205
+ 1.241659164428711,
206
+ 2.9545636367797856,
207
+ 3.1681246757507324,
208
+ 1.7433684396743772,
209
+ 1.4034956693649292,
210
+ 3.548548698425293,
211
+ 1.0,
212
+ 1.0
213
+ ]
214
+ },
215
+ "num_transitions": 6075103,
216
+ "num_trajectories": 27500
217
+ }
218
+ }
run_robotwin_train.sh ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ # export NCCL_SOCKET_IFNAME=bond0
4
+ # export NCCL_IB_HCA=mlx5_2,mlx5_3
5
+ # export NCCL_IB_HCA=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
6
+ # export NCCL_IB_DISABLE=0
7
+ # export NCCL_SOCKET_IFNAME=bond0
8
+ # export NCCL_DEBUG=INFO
9
+ # export NCCL_NVLS_ENABLE=0
10
+ # used for check save when communication
11
+ # export NCCL_BLOCKING_WAIT=1
12
+ # export NCCL_ASYNC_ERROR_HANDLING=1
13
+ # Add the following before running
14
+ # export NCCL_ALGO=Ring
15
+ # export NCCL_PROTO=Simple
16
+ # export NCCL_SHM_DISABLE=1
17
+ # export NCCL_TIMEOUT=1000 # timeout in seconds (1000 s is about 16.7 minutes; use 3600 for 1 hour)
18
+ # export NCCL_SOCKET_TIMEOUT_MS=360000
19
+ export NCCL_P2P_DISABLE=1
20
+ # export CFLAGS="-I/usr/include"
21
+ # export LDFLAGS="-L/usr/lib/x86_64-linux-gnu"
22
+ # export NCCL_DEBUG=INFO
23
+ # export NCCL_DEBUG_SUBSYS=ALL
24
+ # export TORCH_DISTRIBUTED_DEBUG=DETAIL
25
+ # export CUDA_VISIBLE_DEVICES=0,1,2,3
26
+ ###########################################################################################
27
+ # === Please modify the following paths according to your environment ===
28
+ Framework_name=QwenOFT
29
+ freeze_module_list=''  # empty on purpose (presumably "freeze nothing" - confirm); must stay quoted where it is passed below
30
+ base_vlm=/home/jiangjiahao/data/model/CUBEv1-510k
31
+ config_yaml=./examples/Robotwin/train_files/starvla_cotrain_robotwin.yaml
32
+ robotwin_data_root=/home/jiangjiahao/data/Robotwin_lerobot_25000
33
+ run_root_dir=/home/jiangjiahao/data/ckpt/cubev1-Robotwin-oft
34
+ data_mix=robotwin
35
+ run_id="cubev1_${data_mix}_oft_27500"
36
+ # === End of environment variable configuration ===
37
+ ###########################################################################################
38
+
39
+ #batchsize=24
40
+ # export WANDB_MODE=disabled
41
+
42
+ output_dir="${run_root_dir}/${run_id}"
43
+ mkdir -p "${output_dir}"
44
+ # copy this script into the output dir so the run records the exact launch command
45
+ cp "$0" "${output_dir}/"
46
+ # NOTE: this dataset does not include put_object_dustbin and scan objects; the mixtures were changed accordingly. All expansions below are quoted so an empty value is still passed as an argument.
47
+ # Usage: bash examples/Robotwin/train_files/run_robotwin_train.sh
48
+ accelerate launch \
49
+ --config_file starVLA/config/deepseeds/deepspeed_zero2.yaml \
50
+ --num_processes 8 \
51
+ --main_process_port 29500 \
52
+ starVLA/training/train_starvla.py \
53
+ --config_yaml "${config_yaml}" \
54
+ --framework.name "${Framework_name}" \
55
+ --framework.qwenvl.base_vlm "${base_vlm}" \
56
+ --datasets.vla_data.per_device_batch_size 8 \
57
+ --datasets.vla_data.data_mix "${data_mix}" \
58
+ --datasets.vla_data.data_root_dir "${robotwin_data_root}" \
59
+ --trainer.freeze_modules "${freeze_module_list}" \
60
+ --trainer.max_train_steps 120000 \
61
+ --trainer.save_interval 10000 \
62
+ --trainer.logging_frequency 10 \
63
+ --trainer.eval_interval 2000 \
64
+ --run_root_dir "${run_root_dir}" \
65
+ --run_id "${run_id}" \
66
+ --wandb_project cubev1-robotwin \
67
+ --wandb_entity 1732949190-tongji-university \
68
+ # --is_debug True
69
+
70
+
71
+
72
+ ##### Multi-Server Multi-GPU training script #####
73
+ # accelerate launch \
74
+ # --config_file starVLA/config/deepseeds/deepspeed_zero2.yaml \
75
+ # --main_process_ip $MASTER_ADDR \
76
+ # --main_process_port $MASTER_PORT \
77
+ # --machine_rank $SLURM_PROCID \
78
+ # --num_machines $SLURM_NNODES \
79
+ # --num_processes=${TOTAL_GPUS} \
80
+ # starVLA/training/train_starvla.py \
81
+ # --config_yaml ${config_yaml} \
82
+ # --framework.name ${Framework_name} \
83
+ # --framework.qwenvl.base_vlm ${base_vlm} \
84
+ # --run_root_dir ${run_root_dir} \
85
+ # --run_id ${run_id} \
86
+ # --wandb_project your_project \
87
+ # --wandb_entity your_name
88
+ ##### Multi-Server Multi-GPU training script #####
summary.jsonl ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"steps": 10000}
2
+ {"steps": 20000}
3
+ {"steps": 30000}
4
+ {"steps": 40000}
5
+ {"steps": 50000}
6
+ {"steps": 60000}
7
+ {"steps": 70000}
8
+ {"steps": 80000}
9
+ {"steps": 90000}
10
+ {"steps": 100000}
11
+ {"steps": 110000}
12
+ {"steps": 120000}