lihao3 commited on
Commit
dc495af
·
0 Parent(s):

Add model checkpoint

Browse files
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ *.pt filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model:
3
+ - openvla/openvla-7b-prismatic
4
+ - CogACT/CogACT-Base
5
+ tags:
6
+ - robotics
7
+ - vla
8
+ - multimodal
9
+ - pretraining
10
+ pipeline_tag: robotics
11
+ ---
12
+ # Model Card for CronusVLA
13
+
14
+ **Weights**
15
+
16
+ `step2/checkpoints/reshape_embedding_step2.pt`:
17
+
18
+ - (1) This checkpoint serves as the starting point for fine-tuning on the LIBERO benchmark.
19
+ - (2) It is a post-trained model derived from CronusVLA-7B, configured with a two-step frame (history length = 1).
20
+ - (3) During the original post-training phase, the model was trained to predict future actions with a chunk size of 16. To ensure compatibility with the LIBERO setting of OpenVLA-OFT (chunk = 8), we modified the position embeddings responsible for action chunking in the original checkpoint.
21
+ - (4) Note: This checkpoint is not directly evaluable and is intended only as a fine-tuning initialization.
22
+
23
+ `step4/checkpoints/reshape_embedding_step4.pt`:
24
+
25
+ - (1) This checkpoint serves as the starting point for fine-tuning on the LIBERO benchmark.
26
+ - (2) It is a post-trained model derived from CronusVLA-7B, configured with a four-step frame (history length = 3).
27
+ - (3) During the original post-training phase, the model was trained to predict future actions with a chunk size of 16. To ensure compatibility with the LIBERO setting of OpenVLA-OFT (chunk = 8), we modified the position embeddings responsible for action chunking in the original checkpoint.
28
+ - (4) Note: This checkpoint is not directly evaluable and is intended only as a fine-tuning initialization.
29
+
30
+ > If you want to use these checkpoints for fine-tuning, please follow the instructions of [CronusVLA](https://github.com/InternRobotics/CronusVLA).
step2/README.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model:
3
+ - openvla/openvla-7b-prismatic
4
+ - CogACT/CogACT-Base
5
+ tags:
6
+ - robotics
7
+ - vla
8
+ - multimodal
9
+ - pretraining
10
+ pipeline_tag: robotics
11
+ ---
12
+ # Model Card for CronusVLA
13
+
14
+ **Weights**
15
+
16
+ `checkpoints/reshape_embedding_step2.pt`:
17
+
18
+ - (1) This checkpoint serves as the starting point for fine-tuning on the LIBERO benchmark.
19
+ - (2) It is a post-trained model derived from CronusVLA-7B, configured with a two-step frame (history length = 1).
20
+ - (3) During the original post-training phase, the model was trained to predict future actions with a chunk size of 16. To ensure compatibility with the LIBERO setting of OpenVLA-OFT (chunk = 8), we modified the position embeddings responsible for action chunking in the original checkpoint.
21
+ - (4) Note: This checkpoint is not directly evaluable and is intended only as a fine-tuning initialization.
22
+
23
+ > If you want to use this checkpoint for fine-tuning, please follow the instructions of [CronusVLA](https://github.com/InternRobotics/CronusVLA).
step2/checkpoints/reshape_embedding_step2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc133922d2b5fce6ef42476b4557d26de4789da444cf24afff1417dc05e9f612
3
+ size 30703958828
step2/config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_dim": 7,
3
+ "action_model_type": "DiT-B",
4
+ "data_root_dir": "",
5
+ "debug": false,
6
+ "future_action_window_size": 15,
7
+ "hf_token": "hf_token",
8
+ "image_aug": true,
9
+ "is_resume": false,
10
+ "load_all_data_for_training": true,
11
+ "past_action_window_size": 1,
12
+ "pretrained_checkpoint": "path/to/ckpt",
13
+ "repeated_diffusion_steps": 4,
14
+ "resume_epoch": null,
15
+ "resume_step": null,
16
+ "run_id": "step2_7B",
17
+ "run_id_note": null,
18
+ "run_root_dir": "outputs/step2_7B",
19
+ "save_interval": 2500,
20
+ "seed": 42,
21
+ "trackers": [
22
+ "jsonl",
23
+ "wandb"
24
+ ],
25
+ "use_ema": false,
26
+ "vla": {
27
+ "base_vlm": "prism-dinosiglip-224px+7b",
28
+ "data_mix": "bridge_rt_1",
29
+ "enable_gradient_checkpointing": true,
30
+ "enable_mixed_precision_training": true,
31
+ "epochs": 100,
32
+ "expected_world_size": 16,
33
+ "freeze_llm_backbone": false,
34
+ "freeze_vision_backbone": false,
35
+ "global_batch_size": 256,
36
+ "learning_rate": 2e-05,
37
+ "lr_scheduler_type": "constant",
38
+ "max_grad_norm": 1.0,
39
+ "max_steps": null,
40
+ "per_device_batch_size": 16,
41
+ "reduce_in_full_precision": true,
42
+ "shuffle_buffer_size": 250000,
43
+ "train_strategy": "fsdp-full-shard",
44
+ "type": "prism-dinosiglip-224px+oxe+diffusion",
45
+ "unfreeze_last_llm_layer": false,
46
+ "vla_id": "prism-dinosiglip-224px+oxe+diffusion",
47
+ "warmup_ratio": 0.0,
48
+ "weight_decay": 0.0
49
+ },
50
+ "wandb_entity": "",
51
+ "wandb_project": ""
52
+ }
step2/config.yaml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ action_dim: 7
2
+ action_model_type: DiT-B
3
+ data_root_dir: ""
4
+ debug: false
5
+ future_action_window_size: 15
6
+ hf_token: hf_token
7
+ image_aug: true
8
+ is_resume: false
9
+ load_all_data_for_training: true
10
+ past_action_window_size: 1
11
+ pretrained_checkpoint: "path/to/ckpt"
12
+ repeated_diffusion_steps: 4
13
+ resume_epoch: null
14
+ resume_step: null
15
+ run_id: step2_7B
16
+ run_id_note: null
17
+ run_root_dir: outputs/step2_7B
18
+ save_interval: 2500
19
+ seed: 42
20
+ trackers:
21
+ - jsonl
22
+ - wandb
23
+ use_ema: false
24
+ vla:
25
+ base_vlm: prism-dinosiglip-224px+7b
26
+ data_mix: bridge_rt_1
27
+ enable_gradient_checkpointing: true
28
+ enable_mixed_precision_training: true
29
+ epochs: 100
30
+ expected_world_size: 16
31
+ freeze_llm_backbone: false
32
+ freeze_vision_backbone: false
33
+ global_batch_size: 256
34
+ learning_rate: 2.0e-05
35
+ lr_scheduler_type: constant
36
+ max_grad_norm: 1.0
37
+ max_steps: null
38
+ per_device_batch_size: 16
39
+ reduce_in_full_precision: true
40
+ shuffle_buffer_size: 250000
41
+ train_strategy: fsdp-full-shard
42
+ type: prism-dinosiglip-224px+oxe+diffusion
43
+ unfreeze_last_llm_layer: false
44
+ vla_id: prism-dinosiglip-224px+oxe+diffusion
45
+ warmup_ratio: 0.0
46
+ weight_decay: 0.0
47
+ wandb_entity: ""
48
+ wandb_project: ""
step2/dataset_statistics.json ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bridge_dataset": {
3
+ "action": {
4
+ "mean": [
5
+ 0.00023341973428614438,
6
+ 0.0001300475705647841,
7
+ -0.00012762400729116052,
8
+ -0.00015565499779768288,
9
+ -0.00040393517701886594,
10
+ 0.0002355772303417325,
11
+ 0.5764579772949219
12
+ ],
13
+ "std": [
14
+ 0.009765934199094772,
15
+ 0.013689189217984676,
16
+ 0.012667394243180752,
17
+ 0.028534121811389923,
18
+ 0.030637938529253006,
19
+ 0.07691467553377151,
20
+ 0.4973696768283844
21
+ ],
22
+ "max": [
23
+ 0.41691166162490845,
24
+ 0.25864794850349426,
25
+ 0.21218234300613403,
26
+ 3.122201919555664,
27
+ 1.8618112802505493,
28
+ 6.280478477478027,
29
+ 1.0
30
+ ],
31
+ "min": [
32
+ -0.4007510244846344,
33
+ -0.13874775171279907,
34
+ -0.22553899884223938,
35
+ -3.2010786533355713,
36
+ -1.8618112802505493,
37
+ -6.279075622558594,
38
+ 0.0
39
+ ],
40
+ "q01": [
41
+ -0.02872725307941437,
42
+ -0.04170349963009357,
43
+ -0.026093858778476715,
44
+ -0.08092105075716972,
45
+ -0.09288699507713317,
46
+ -0.20718276381492615,
47
+ 0.0
48
+ ],
49
+ "q99": [
50
+ 0.028309678435325586,
51
+ 0.040855254605412394,
52
+ 0.040161586627364146,
53
+ 0.08192047759890528,
54
+ 0.07792850524187081,
55
+ 0.20382574498653397,
56
+ 1.0
57
+ ],
58
+ "mask": [
59
+ true,
60
+ true,
61
+ true,
62
+ true,
63
+ true,
64
+ true,
65
+ false
66
+ ]
67
+ },
68
+ "proprio": {
69
+ "mean": [
70
+ 0.0,
71
+ 0.0,
72
+ 0.0,
73
+ 0.0,
74
+ 0.0,
75
+ 0.0,
76
+ 0.0
77
+ ],
78
+ "std": [
79
+ 0.0,
80
+ 0.0,
81
+ 0.0,
82
+ 0.0,
83
+ 0.0,
84
+ 0.0,
85
+ 0.0
86
+ ],
87
+ "max": [
88
+ 0.0,
89
+ 0.0,
90
+ 0.0,
91
+ 0.0,
92
+ 0.0,
93
+ 0.0,
94
+ 0.0
95
+ ],
96
+ "min": [
97
+ 0.0,
98
+ 0.0,
99
+ 0.0,
100
+ 0.0,
101
+ 0.0,
102
+ 0.0,
103
+ 0.0
104
+ ],
105
+ "q01": [
106
+ 0.0,
107
+ 0.0,
108
+ 0.0,
109
+ 0.0,
110
+ 0.0,
111
+ 0.0,
112
+ 0.0
113
+ ],
114
+ "q99": [
115
+ 0.0,
116
+ 0.0,
117
+ 0.0,
118
+ 0.0,
119
+ 0.0,
120
+ 0.0,
121
+ 0.0
122
+ ]
123
+ },
124
+ "num_transitions": 2135463,
125
+ "num_trajectories": 60064
126
+ },
127
+ "fractal20220817_data": {
128
+ "action": {
129
+ "mean": [
130
+ 0.006987491622567177,
131
+ 0.00626587588340044,
132
+ -0.012625089846551418,
133
+ 0.04333178699016571,
134
+ -0.005756180267781019,
135
+ 0.0009131028782576323,
136
+ 0.5354204773902893
137
+ ],
138
+ "std": [
139
+ 0.06921201944351196,
140
+ 0.059655144810676575,
141
+ 0.0735311210155487,
142
+ 0.15610052645206451,
143
+ 0.131641685962677,
144
+ 0.14593306183815002,
145
+ 0.49710750579833984
146
+ ],
147
+ "max": [
148
+ 2.9984593391418457,
149
+ 22.09052848815918,
150
+ 2.7507524490356445,
151
+ 1.570636510848999,
152
+ 1.5321086645126343,
153
+ 1.5691522359848022,
154
+ 1.0
155
+ ],
156
+ "min": [
157
+ -2.0204520225524902,
158
+ -5.497899532318115,
159
+ -2.031663417816162,
160
+ -1.569917917251587,
161
+ -1.569892168045044,
162
+ -1.570419430732727,
163
+ 0.0
164
+ ],
165
+ "q01": [
166
+ -0.22453527510166169,
167
+ -0.14820013284683228,
168
+ -0.231589707583189,
169
+ -0.3517994859814644,
170
+ -0.4193011274933815,
171
+ -0.43643461108207704,
172
+ 0.0
173
+ ],
174
+ "q99": [
175
+ 0.17824687153100965,
176
+ 0.14938379630446405,
177
+ 0.21842354819178575,
178
+ 0.5892666035890578,
179
+ 0.35272657424211445,
180
+ 0.44796681255102094,
181
+ 1.0
182
+ ],
183
+ "mask": [
184
+ true,
185
+ true,
186
+ true,
187
+ true,
188
+ true,
189
+ true,
190
+ false
191
+ ]
192
+ },
193
+ "proprio": {
194
+ "mean": [
195
+ 0.0,
196
+ 0.0,
197
+ 0.0,
198
+ 0.0,
199
+ 0.0,
200
+ 0.0,
201
+ 0.0
202
+ ],
203
+ "std": [
204
+ 0.0,
205
+ 0.0,
206
+ 0.0,
207
+ 0.0,
208
+ 0.0,
209
+ 0.0,
210
+ 0.0
211
+ ],
212
+ "max": [
213
+ 0.0,
214
+ 0.0,
215
+ 0.0,
216
+ 0.0,
217
+ 0.0,
218
+ 0.0,
219
+ 0.0
220
+ ],
221
+ "min": [
222
+ 0.0,
223
+ 0.0,
224
+ 0.0,
225
+ 0.0,
226
+ 0.0,
227
+ 0.0,
228
+ 0.0
229
+ ],
230
+ "q01": [
231
+ 0.0,
232
+ 0.0,
233
+ 0.0,
234
+ 0.0,
235
+ 0.0,
236
+ 0.0,
237
+ 0.0
238
+ ],
239
+ "q99": [
240
+ 0.0,
241
+ 0.0,
242
+ 0.0,
243
+ 0.0,
244
+ 0.0,
245
+ 0.0,
246
+ 0.0
247
+ ]
248
+ },
249
+ "num_transitions": 3786400,
250
+ "num_trajectories": 87212
251
+ }
252
+ }
step2/run-metrics.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hparams": {"action_dim": 7, "action_model_type": "DiT-B", "data_root_dir": "", "debug": false, "future_action_window_size": 15, "hf_token": "hf_token", "image_aug": true, "is_resume": false, "load_all_data_for_training": true, "past_action_window_size": 1, "pretrained_checkpoint": "path/to/ckpt", "repeated_diffusion_steps": 4, "resume_epoch": null, "resume_step": null, "run_id": "step2_7B", "run_id_note": null, "run_root_dir": "outputs/step2_7B", "save_interval": 5000, "seed": 42, "trackers": ["jsonl", "wandb"], "use_ema": false, "vla": {"base_vlm": "prism-dinosiglip-224px+7b", "data_mix": "bridge_rt_1", "enable_gradient_checkpointing": true, "enable_mixed_precision_training": true, "epochs": 100, "expected_world_size": 16, "freeze_llm_backbone": false, "freeze_vision_backbone": false, "global_batch_size": 256, "learning_rate": 2e-05, "lr_scheduler_type": "constant", "max_grad_norm": 1.0, "max_steps": null, "per_device_batch_size": 16, "reduce_in_full_precision": true, "shuffle_buffer_size": 250000, "train_strategy": "fsdp-full-shard", "type": "prism-dinosiglip-224px+oxe+diffusion", "unfreeze_last_llm_layer": false, "vla_id": "prism-dinosiglip-224px+oxe+diffusion", "warmup_ratio": 0.0, "weight_decay": 0.0}, "wandb_entity": "", "wandb_project": ""}, "run_id": "step2_7B"}
step4/README.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model:
3
+ - openvla/openvla-7b-prismatic
4
+ - CogACT/CogACT-Base
5
+ tags:
6
+ - robotics
7
+ - vla
8
+ - multimodal
9
+ - pretraining
10
+ pipeline_tag: robotics
11
+ ---
12
+ # Model Card for CronusVLA
13
+
14
+ **Weights**
15
+
16
+ `checkpoints/reshape_embedding_step4.pt`:
17
+
18
+ - (1) This checkpoint serves as the starting point for fine-tuning on the LIBERO benchmark.
19
+ - (2) It is a post-trained model derived from CronusVLA-7B, configured with a four-step frame (history length = 3).
20
+ - (3) During the original post-training phase, the model was trained to predict future actions with a chunk size of 16. To ensure compatibility with the LIBERO setting of OpenVLA-OFT (chunk = 8), we modified the position embeddings responsible for action chunking in the original checkpoint.
21
+ - (4) Note: This checkpoint is not directly evaluable and is intended only as a fine-tuning initialization.
22
+
23
+ > If you want to use this checkpoint for fine-tuning, please follow the instructions of [CronusVLA](https://github.com/InternRobotics/CronusVLA).
step4/checkpoints/reshape_embedding_step4.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2142c243d7b87f29396bcedf242b2f5fd170bfca8895a773a70c5847c45a91c
3
+ size 30703997740
step4/config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_dim": 7,
3
+ "action_model_type": "DiT-B",
4
+ "data_root_dir": "",
5
+ "debug": false,
6
+ "future_action_window_size": 15,
7
+ "hf_token": "hf_token",
8
+ "image_aug": true,
9
+ "is_resume": false,
10
+ "load_all_data_for_training": true,
11
+ "past_action_window_size": 3,
12
+ "pretrained_checkpoint": "path/to/ckpt",
13
+ "repeated_diffusion_steps": 4,
14
+ "resume_epoch": null,
15
+ "resume_step": null,
16
+ "run_id": "step4_7B",
17
+ "run_id_note": null,
18
+ "run_root_dir": "outputs/step4_7B",
19
+ "save_interval": 2500,
20
+ "seed": 42,
21
+ "trackers": [
22
+ "jsonl",
23
+ "wandb"
24
+ ],
25
+ "use_ema": false,
26
+ "vla": {
27
+ "base_vlm": "prism-dinosiglip-224px+7b",
28
+ "data_mix": "bridge_rt_1",
29
+ "enable_gradient_checkpointing": true,
30
+ "enable_mixed_precision_training": true,
31
+ "epochs": 100,
32
+ "expected_world_size": 64,
33
+ "freeze_llm_backbone": false,
34
+ "freeze_vision_backbone": false,
35
+ "global_batch_size": 512,
36
+ "learning_rate": 4e-05,
37
+ "lr_scheduler_type": "constant",
38
+ "max_grad_norm": 1.0,
39
+ "max_steps": null,
40
+ "per_device_batch_size": 8,
41
+ "reduce_in_full_precision": true,
42
+ "shuffle_buffer_size": 250000,
43
+ "train_strategy": "fsdp-full-shard",
44
+ "type": "prism-dinosiglip-224px+oxe+diffusion",
45
+ "unfreeze_last_llm_layer": false,
46
+ "vla_id": "prism-dinosiglip-224px+oxe+diffusion",
47
+ "warmup_ratio": 0.0,
48
+ "weight_decay": 0.0
49
+ },
50
+ "wandb_entity": "",
51
+ "wandb_project": ""
52
+ }
step4/config.yaml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ action_dim: 7
2
+ action_model_type: DiT-B
3
+ data_root_dir: ""
4
+ debug: false
5
+ future_action_window_size: 15
6
+ hf_token: hf_token
7
+ image_aug: true
8
+ is_resume: false
9
+ load_all_data_for_training: true
10
+ past_action_window_size: 3
11
+ pretrained_checkpoint: "path/to/ckpt"
12
+ repeated_diffusion_steps: 4
13
+ resume_epoch: null
14
+ resume_step: null
15
+ run_id: step4_7B
16
+ run_id_note: null
17
+ run_root_dir: outputs/step4_7B
18
+ save_interval: 2500
19
+ seed: 42
20
+ trackers:
21
+ - jsonl
22
+ - wandb
23
+ use_ema: false
24
+ vla:
25
+ base_vlm: prism-dinosiglip-224px+7b
26
+ data_mix: bridge_rt_1
27
+ enable_gradient_checkpointing: true
28
+ enable_mixed_precision_training: true
29
+ epochs: 100
30
+ expected_world_size: 64
31
+ freeze_llm_backbone: false
32
+ freeze_vision_backbone: false
33
+ global_batch_size: 512
34
+ learning_rate: 4.0e-05
35
+ lr_scheduler_type: constant
36
+ max_grad_norm: 1.0
37
+ max_steps: null
38
+ per_device_batch_size: 8
39
+ reduce_in_full_precision: true
40
+ shuffle_buffer_size: 250000
41
+ train_strategy: fsdp-full-shard
42
+ type: prism-dinosiglip-224px+oxe+diffusion
43
+ unfreeze_last_llm_layer: false
44
+ vla_id: prism-dinosiglip-224px+oxe+diffusion
45
+ warmup_ratio: 0.0
46
+ weight_decay: 0.0
47
+ wandb_entity: ""
48
+ wandb_project: ""
step4/dataset_statistics.json ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bridge_dataset": {
3
+ "action": {
4
+ "mean": [
5
+ 0.00023341973428614438,
6
+ 0.0001300475705647841,
7
+ -0.00012762400729116052,
8
+ -0.00015565499779768288,
9
+ -0.00040393517701886594,
10
+ 0.0002355772303417325,
11
+ 0.5764579772949219
12
+ ],
13
+ "std": [
14
+ 0.009765934199094772,
15
+ 0.013689189217984676,
16
+ 0.012667394243180752,
17
+ 0.028534121811389923,
18
+ 0.030637938529253006,
19
+ 0.07691467553377151,
20
+ 0.4973696768283844
21
+ ],
22
+ "max": [
23
+ 0.41691166162490845,
24
+ 0.25864794850349426,
25
+ 0.21218234300613403,
26
+ 3.122201919555664,
27
+ 1.8618112802505493,
28
+ 6.280478477478027,
29
+ 1.0
30
+ ],
31
+ "min": [
32
+ -0.4007510244846344,
33
+ -0.13874775171279907,
34
+ -0.22553899884223938,
35
+ -3.2010786533355713,
36
+ -1.8618112802505493,
37
+ -6.279075622558594,
38
+ 0.0
39
+ ],
40
+ "q01": [
41
+ -0.02872725307941437,
42
+ -0.04170349963009357,
43
+ -0.026093858778476715,
44
+ -0.08092105075716972,
45
+ -0.09288699507713317,
46
+ -0.20718276381492615,
47
+ 0.0
48
+ ],
49
+ "q99": [
50
+ 0.028309678435325586,
51
+ 0.040855254605412394,
52
+ 0.040161586627364146,
53
+ 0.08192047759890528,
54
+ 0.07792850524187081,
55
+ 0.20382574498653397,
56
+ 1.0
57
+ ],
58
+ "mask": [
59
+ true,
60
+ true,
61
+ true,
62
+ true,
63
+ true,
64
+ true,
65
+ false
66
+ ]
67
+ },
68
+ "proprio": {
69
+ "mean": [
70
+ 0.0,
71
+ 0.0,
72
+ 0.0,
73
+ 0.0,
74
+ 0.0,
75
+ 0.0,
76
+ 0.0
77
+ ],
78
+ "std": [
79
+ 0.0,
80
+ 0.0,
81
+ 0.0,
82
+ 0.0,
83
+ 0.0,
84
+ 0.0,
85
+ 0.0
86
+ ],
87
+ "max": [
88
+ 0.0,
89
+ 0.0,
90
+ 0.0,
91
+ 0.0,
92
+ 0.0,
93
+ 0.0,
94
+ 0.0
95
+ ],
96
+ "min": [
97
+ 0.0,
98
+ 0.0,
99
+ 0.0,
100
+ 0.0,
101
+ 0.0,
102
+ 0.0,
103
+ 0.0
104
+ ],
105
+ "q01": [
106
+ 0.0,
107
+ 0.0,
108
+ 0.0,
109
+ 0.0,
110
+ 0.0,
111
+ 0.0,
112
+ 0.0
113
+ ],
114
+ "q99": [
115
+ 0.0,
116
+ 0.0,
117
+ 0.0,
118
+ 0.0,
119
+ 0.0,
120
+ 0.0,
121
+ 0.0
122
+ ]
123
+ },
124
+ "num_transitions": 2135463,
125
+ "num_trajectories": 60064
126
+ },
127
+ "fractal20220817_data": {
128
+ "action": {
129
+ "mean": [
130
+ 0.006987491622567177,
131
+ 0.00626587588340044,
132
+ -0.012625089846551418,
133
+ 0.04333178699016571,
134
+ -0.005756180267781019,
135
+ 0.0009131028782576323,
136
+ 0.5354204773902893
137
+ ],
138
+ "std": [
139
+ 0.06921201944351196,
140
+ 0.059655144810676575,
141
+ 0.0735311210155487,
142
+ 0.15610052645206451,
143
+ 0.131641685962677,
144
+ 0.14593306183815002,
145
+ 0.49710750579833984
146
+ ],
147
+ "max": [
148
+ 2.9984593391418457,
149
+ 22.09052848815918,
150
+ 2.7507524490356445,
151
+ 1.570636510848999,
152
+ 1.5321086645126343,
153
+ 1.5691522359848022,
154
+ 1.0
155
+ ],
156
+ "min": [
157
+ -2.0204520225524902,
158
+ -5.497899532318115,
159
+ -2.031663417816162,
160
+ -1.569917917251587,
161
+ -1.569892168045044,
162
+ -1.570419430732727,
163
+ 0.0
164
+ ],
165
+ "q01": [
166
+ -0.22453527510166169,
167
+ -0.14820013284683228,
168
+ -0.231589707583189,
169
+ -0.3517994859814644,
170
+ -0.4193011274933815,
171
+ -0.43643461108207704,
172
+ 0.0
173
+ ],
174
+ "q99": [
175
+ 0.17824687153100965,
176
+ 0.14938379630446405,
177
+ 0.21842354819178575,
178
+ 0.5892666035890578,
179
+ 0.35272657424211445,
180
+ 0.44796681255102094,
181
+ 1.0
182
+ ],
183
+ "mask": [
184
+ true,
185
+ true,
186
+ true,
187
+ true,
188
+ true,
189
+ true,
190
+ false
191
+ ]
192
+ },
193
+ "proprio": {
194
+ "mean": [
195
+ 0.0,
196
+ 0.0,
197
+ 0.0,
198
+ 0.0,
199
+ 0.0,
200
+ 0.0,
201
+ 0.0
202
+ ],
203
+ "std": [
204
+ 0.0,
205
+ 0.0,
206
+ 0.0,
207
+ 0.0,
208
+ 0.0,
209
+ 0.0,
210
+ 0.0
211
+ ],
212
+ "max": [
213
+ 0.0,
214
+ 0.0,
215
+ 0.0,
216
+ 0.0,
217
+ 0.0,
218
+ 0.0,
219
+ 0.0
220
+ ],
221
+ "min": [
222
+ 0.0,
223
+ 0.0,
224
+ 0.0,
225
+ 0.0,
226
+ 0.0,
227
+ 0.0,
228
+ 0.0
229
+ ],
230
+ "q01": [
231
+ 0.0,
232
+ 0.0,
233
+ 0.0,
234
+ 0.0,
235
+ 0.0,
236
+ 0.0,
237
+ 0.0
238
+ ],
239
+ "q99": [
240
+ 0.0,
241
+ 0.0,
242
+ 0.0,
243
+ 0.0,
244
+ 0.0,
245
+ 0.0,
246
+ 0.0
247
+ ]
248
+ },
249
+ "num_transitions": 3786400,
250
+ "num_trajectories": 87212
251
+ }
252
+ }
step4/run-metrics.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hparams": {"action_dim": 7, "action_model_type": "DiT-B", "data_root_dir": "", "debug": false, "future_action_window_size": 15, "hf_token": "hf_token", "image_aug": true, "is_resume": false, "load_all_data_for_training": true, "past_action_window_size": 3, "pretrained_checkpoint": "path/to/ckpt", "repeated_diffusion_steps": 4, "resume_epoch": null, "resume_step": null, "run_id": "step4_7B", "run_id_note": null, "run_root_dir": "outputs/step4_7B", "save_interval": 2500, "seed": 42, "trackers": ["jsonl", "wandb"], "use_ema": false, "vla": {"base_vlm": "prism-dinosiglip-224px+7b", "data_mix": "bridge_rt_1", "enable_gradient_checkpointing": true, "enable_mixed_precision_training": true, "epochs": 100, "expected_world_size": 64, "freeze_llm_backbone": false, "freeze_vision_backbone": false, "global_batch_size": 512, "learning_rate": 4e-05, "lr_scheduler_type": "constant", "max_grad_norm": 1.0, "max_steps": null, "per_device_batch_size": 8, "reduce_in_full_precision": true, "shuffle_buffer_size": 250000, "train_strategy": "fsdp-full-shard", "type": "prism-dinosiglip-224px+oxe+diffusion", "unfreeze_last_llm_layer": false, "vla_id": "prism-dinosiglip-224px+oxe+diffusion", "warmup_ratio": 0.0, "weight_decay": 0.0}, "wandb_entity": "", "wandb_project": ""}, "run_id": "step4_7B"}