SII-LibAI committed on
Commit
2f83f4a
·
verified ·
1 Parent(s): 942f9fd

upload model directory

Browse files
checkpoints/steps_30000_pytorch_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52e1f230b12bf636a7d4460f43aeb2afa68ba3cc62777739f7d7a38a1fb0b087
3
+ size 9785132555
config.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets:
2
+ vla_data:
3
+ data_mix: robotwin
4
+ data_root_dir: /inspire/ssd/project/embodied-basic-model/zhangjianing-253108140206/DATASET/robotwin_lerobot
5
+ dataset_py: lerobot_datasets
6
+ image_size:
7
+ - 224
8
+ - 224
9
+ per_device_batch_size: 8
10
+ video_backend: torchvision_av
11
+ framework:
12
+ action_model:
13
+ action_dim: 14
14
+ action_hidden_dim: 2560
15
+ action_model_type: DiT-B
16
+ future_action_window_size: 15
17
+ past_action_window_size: 0
18
+ name: QwenOFT
19
+ qwenvl:
20
+ base_vlm: /inspire/ssd/project/embodied-basic-model/zhangjianing-253108140206/DATASET/model/spiritv1.5
21
+ output_dir: /inspire/ssd/project/embodied-basic-model/zhangjianing-253108140206/experiment/spirit_vla/starvla-vla/results/124_robotwin_spirit
22
+ run_id: 124_robotwin_spirit
23
+ run_root_dir: /inspire/ssd/project/embodied-basic-model/zhangjianing-253108140206/experiment/spirit_vla/starvla-vla/results
24
+ seed: 42
25
+ trainer:
26
+ eval_interval: 1000
27
+ freeze_modules: true
28
+ gradient_accumulation_steps: 1
29
+ gradient_clipping: 1.0
30
+ is_resume: false
31
+ learning_rate:
32
+ action_model: 0.0001
33
+ base: 1.0e-05
34
+ qwen_vl_interface: 1.0e-05
35
+ logging_frequency: 100
36
+ lr_scheduler_type: cosine_with_min_lr
37
+ max_train_steps: 30000
38
+ num_warmup_steps: 5000
39
+ optimizer:
40
+ betas:
41
+ - 0.9
42
+ - 0.95
43
+ eps: 1.0e-08
44
+ weight_decay: 1.0e-08
45
+ save_interval: 10000
46
+ scheduler_specific_kwargs:
47
+ min_lr: 5.0e-07
48
+ wandb_entity: 1732949190-tongji-university
49
+ wandb_project: spirit
dataset_statistics.json ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "new_embodiment": {
3
+ "action": {
4
+ "mean": [
5
+ -0.2331667154282331,
6
+ 1.1028118824958806,
7
+ 0.7864713907241822,
8
+ -0.32033259890973564,
9
+ 0.05814607566400812,
10
+ -0.05603163477033378,
11
+ 0.21005579456686974,
12
+ 1.0977823150157928,
13
+ 0.8011256510019301,
14
+ -0.34791447412222615,
15
+ -0.022669792570286517,
16
+ 0.059191535860300064,
17
+ 0.671402801275253,
18
+ 0.6659822088479995
19
+ ],
20
+ "std": [
21
+ 0.40488538027628157,
22
+ 1.0056974943165093,
23
+ 0.7723789897163711,
24
+ 0.6712645336528242,
25
+ 0.28260278188743754,
26
+ 0.6757600816670439,
27
+ 0.3930471656426581,
28
+ 1.0201486874323196,
29
+ 0.7930296339277983,
30
+ 0.6864149816970117,
31
+ 0.2509440636057764,
32
+ 0.6816604421564468,
33
+ 0.45032166654934785,
34
+ 0.4520699954092942
35
+ ],
36
+ "max": [
37
+ 0.4363388121128082,
38
+ 3.896630048751831,
39
+ 4.553252220153809,
40
+ 1.791752576828003,
41
+ 1.6647447347640991,
42
+ 4.326117515563965,
43
+ 3.3414716720581055,
44
+ 3.5858347415924072,
45
+ 5.672450065612793,
46
+ 1.9447470903396606,
47
+ 1.5042771100997925,
48
+ 3.819632053375244,
49
+ 1.0,
50
+ 1.0
51
+ ],
52
+ "min": [
53
+ -7.321954727172852,
54
+ -0.00418000016361475,
55
+ -0.0149909146130085,
56
+ -1.9549700021743774,
57
+ -1.43248450756073,
58
+ -7.091593265533447,
59
+ -8.539926528930664,
60
+ -0.5945725440979004,
61
+ -0.07252676039934158,
62
+ -2.0857622623443604,
63
+ -2.047459840774536,
64
+ -6.275933742523193,
65
+ 0.0,
66
+ 0.0
67
+ ],
68
+ "q01": [
69
+ -7.156214237213135,
70
+ -5.257390398583084e-07,
71
+ -2.8215323254698887e-05,
72
+ -1.8530020713806152,
73
+ -1.3616564273834229,
74
+ -6.243625698089599,
75
+ -8.494686126708984,
76
+ -0.5754004126787186,
77
+ -2.81171942333458e-05,
78
+ -1.8067627024650574,
79
+ -1.4502456188201904,
80
+ -5.74780608177185,
81
+ 0.0,
82
+ 0.0
83
+ ],
84
+ "q99": [
85
+ 0.4322364914417267,
86
+ 3.528747615814209,
87
+ 4.213814439773559,
88
+ 1.6591367983818048,
89
+ 1.4808999300003052,
90
+ 2.9189868807792663,
91
+ 1.2362913405895235,
92
+ 3.00386118888855,
93
+ 4.1129137754440315,
94
+ 1.75497855067253,
95
+ 1.501461386680603,
96
+ 3.7943292021751405,
97
+ 1.0,
98
+ 1.0
99
+ ],
100
+ "mask": [
101
+ true,
102
+ true,
103
+ true,
104
+ true,
105
+ true,
106
+ true,
107
+ true,
108
+ true,
109
+ true,
110
+ true,
111
+ true,
112
+ true,
113
+ false,
114
+ false
115
+ ]
116
+ },
117
+ "state": {
118
+ "mean": [
119
+ -0.23170382969081404,
120
+ 1.0965768384933474,
121
+ 0.7819626295566559,
122
+ -0.31852622993290425,
123
+ 0.057760832709902836,
124
+ -0.055021945205517134,
125
+ 0.20828876227140425,
126
+ 1.0905675184726715,
127
+ 0.7958361715078353,
128
+ -0.34572803400456903,
129
+ -0.02242892236566149,
130
+ 0.058168093403801316,
131
+ 0.6732750406861303,
132
+ 0.6677672982215882
133
+ ],
134
+ "std": [
135
+ 0.4041338455301996,
136
+ 1.006313901997396,
137
+ 0.7722665737866291,
138
+ 0.6693469932644355,
139
+ 0.2816361902175701,
140
+ 0.6729632740733544,
141
+ 0.39232694117902944,
142
+ 1.0205017587198142,
143
+ 0.7927670273279362,
144
+ 0.684256277696324,
145
+ 0.24975242963368358,
146
+ 0.6782357193592726,
147
+ 0.4496057394878301,
148
+ 0.4514107074270294
149
+ ],
150
+ "max": [
151
+ 0.4363388121128082,
152
+ 3.896630048751831,
153
+ 4.553252220153809,
154
+ 1.791752576828003,
155
+ 1.6647447347640991,
156
+ 4.326117515563965,
157
+ 3.3414716720581055,
158
+ 3.5858347415924072,
159
+ 5.672450065612793,
160
+ 1.9440714120864868,
161
+ 1.5042771100997925,
162
+ 3.819632053375244,
163
+ 1.0,
164
+ 1.0
165
+ ],
166
+ "min": [
167
+ -7.321954727172852,
168
+ -0.00418000016361475,
169
+ -0.0149909146130085,
170
+ -1.9549700021743774,
171
+ -1.43248450756073,
172
+ -7.091593265533447,
173
+ -8.539926528930664,
174
+ -0.5945725440979004,
175
+ -0.07252676039934158,
176
+ -2.0857622623443604,
177
+ -2.047459840774536,
178
+ -6.275933742523193,
179
+ 0.0,
180
+ 0.0
181
+ ],
182
+ "q01": [
183
+ -7.156214237213135,
184
+ -5.257390398583084e-07,
185
+ -2.8215323254698887e-05,
186
+ -1.8530020713806152,
187
+ -1.3616564273834229,
188
+ -6.243625698089599,
189
+ -8.494686126708984,
190
+ -0.5754004126787186,
191
+ -2.81171942333458e-05,
192
+ -1.8009709119796753,
193
+ -1.4502456188201904,
194
+ -5.647760705947876,
195
+ 0.0,
196
+ 0.0
197
+ ],
198
+ "q99": [
199
+ 0.4317424774169923,
200
+ 3.5283490157127373,
201
+ 4.2126740026473986,
202
+ 1.6591367983818048,
203
+ 1.4808999300003052,
204
+ 2.9188456654548647,
205
+ 1.2358578193187715,
206
+ 3.00386118888855,
207
+ 4.1129137754440315,
208
+ 1.7217634475231163,
209
+ 1.501461386680603,
210
+ 3.793578088283539,
211
+ 1.0,
212
+ 1.0
213
+ ]
214
+ },
215
+ "num_transitions": 552050,
216
+ "num_trajectories": 2500
217
+ }
218
+ }
run_robotwin_train.sh ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ # export NCCL_SOCKET_IFNAME=bond0
4
+ # export NCCL_IB_HCA=mlx5_2,mlx5_3
5
+
6
+ # NCCL settings intended to guard against communication hangs (e.g. while saving checkpoints)
7
+ export NCCL_BLOCKING_WAIT=1
8
+ export NCCL_ASYNC_ERROR_HANDLING=1
9
+ export NCCL_TIMEOUT=1000 # timeout of 1000 seconds (~16.7 minutes); use 3600 if 1 hour is intended
10
+ export NCCL_SOCKET_TIMEOUT_MS=360000
11
+ export NCCL_P2P_DISABLE=1
12
+ # export NCCL_DEBUG=INFO
13
+ # export NCCL_DEBUG_SUBSYS=ALL
14
+ # export TORCH_DISTRIBUTED_DEBUG=DETAIL
15
+
16
+ ###########################################################################################
17
+ # === Please modify the following paths according to your environment ===
18
+ Framework_name=QwenOFT
19
+ freeze_module_list=''
20
+ base_vlm=/inspire/ssd/project/embodied-basic-model/zhangjianing-253108140206/DATASET/model/spiritv1.5
21
+ config_yaml=./examples/Robotwin/train_files/starvla_cotrain_robotwin.yaml
22
+ robotwin_data_root=/inspire/ssd/project/embodied-basic-model/zhangjianing-253108140206/DATASET/robotwin_lerobot
23
+ run_root_dir=/inspire/ssd/project/embodied-basic-model/zhangjianing-253108140206/experiment/spirit_vla/starvla-vla/results
24
+ data_mix=robotwin
25
+ run_id=124_${data_mix}_spirit
26
+ # === End of environment variable configuration ===
27
+ ###########################################################################################
28
+
29
+ #batchsize=24
30
+ export WANDB_MODE=disabled
31
+
32
+ output_dir=${run_root_dir}/${run_id}
33
+ mkdir -p ${output_dir}
34
+ # copy this script to the output dir
35
+ cp $0 ${output_dir}/
36
+ # The data here does not include put_object_dustbin and scan objects; the mixtures were changed accordingly
37
+ #bash examples/Robotwin/train_files/run_robotwin_train.sh
38
+ accelerate launch \
39
+ --config_file starVLA/config/deepseeds/deepspeed_zero2.yaml \
40
+ --num_processes 4 \
41
+ starVLA/training/train_starvla.py \
42
+ --config_yaml ${config_yaml} \
43
+ --framework.name ${Framework_name} \
44
+ --framework.qwenvl.base_vlm ${base_vlm} \
45
+ --datasets.vla_data.per_device_batch_size 8 \
46
+ --datasets.vla_data.data_mix ${data_mix} \
47
+ --datasets.vla_data.data_root_dir ${robotwin_data_root}\
48
+ --trainer.freeze_modules ${freeze_module_list} \
49
+ --trainer.max_train_steps 30000 \
50
+ --trainer.save_interval 10000 \
51
+ --trainer.logging_frequency 100 \
52
+ --trainer.eval_interval 1000 \
53
+ --run_root_dir ${run_root_dir} \
54
+ --run_id ${run_id} \
55
+ --wandb_project spirit \
56
+ --wandb_entity 1732949190-tongji-university \
57
+ # --is_debug True
58
+
59
+
60
+
61
+ ##### Multi-Server Multi-GPU training script #####
62
+ # accelerate launch \
63
+ # --config_file starVLA/config/deepseeds/deepspeed_zero2.yaml \
64
+ # --main_process_ip $MASTER_ADDR \
65
+ # --main_process_port $MASTER_PORT \
66
+ # --machine_rank $SLURM_PROCID \
67
+ # --num_machines $SLURM_NNODES \
68
+ # --num_processes=${TOTAL_GPUS} \
69
+ # starVLA/training/train_starvla.py \
70
+ # --config_yaml ${config_yaml} \
71
+ # --framework.name ${Framework_name} \
72
+ # --framework.qwenvl.base_vlm ${base_vlm} \
73
+ # --run_root_dir ${run_root_dir} \
74
+ # --run_id ${run_id} \
75
+ # --wandb_project your_project \
76
+ # --wandb_entity your_name
77
+ ##### Multi-Server Multi-GPU training script #####
summary.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {"steps": 10000}
2
+ {"steps": 20000}
3
+ {"steps": 30000}