ducido committed on
Commit
6d982dc
·
verified ·
1 Parent(s): 3f6cb19

Initial commit

Browse files
Files changed (3) hide show
  1. config.json +134 -0
  2. model.safetensors +3 -0
  3. train_config.json +327 -0
config.json ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "type": "smolvla",
3
+ "n_obs_steps": 1,
4
+ "normalization_mapping": {
5
+ "VISUAL": "IDENTITY",
6
+ "STATE": "MEAN_STD",
7
+ "ACTION": "MEAN_STD"
8
+ },
9
+ "input_features": {
10
+ "observation.images.image": {
11
+ "type": "VISUAL",
12
+ "shape": [
13
+ 3,
14
+ 256,
15
+ 256
16
+ ]
17
+ },
18
+ "observation.images.wrist_image": {
19
+ "type": "VISUAL",
20
+ "shape": [
21
+ 3,
22
+ 256,
23
+ 256
24
+ ]
25
+ },
26
+ "observation.images.image_mask": {
27
+ "type": "VISUAL",
28
+ "shape": [
29
+ 3,
30
+ 256,
31
+ 256
32
+ ]
33
+ },
34
+ "observation.images.wrist_mask": {
35
+ "type": "VISUAL",
36
+ "shape": [
37
+ 3,
38
+ 256,
39
+ 256
40
+ ]
41
+ },
42
+ "observation.images.object_of_interest_mask": {
43
+ "type": "VISUAL",
44
+ "shape": [
45
+ 3,
46
+ 256,
47
+ 256
48
+ ]
49
+ },
50
+ "observation.images.object_of_interest_wrist_mask": {
51
+ "type": "VISUAL",
52
+ "shape": [
53
+ 3,
54
+ 256,
55
+ 256
56
+ ]
57
+ },
58
+ "observation.state": {
59
+ "type": "STATE",
60
+ "shape": [
61
+ 8
62
+ ]
63
+ },
64
+ "observation.states.ee_state": {
65
+ "type": "STATE",
66
+ "shape": [
67
+ 6
68
+ ]
69
+ },
70
+ "observation.states.joint_state": {
71
+ "type": "STATE",
72
+ "shape": [
73
+ 7
74
+ ]
75
+ },
76
+ "observation.states.gripper_state": {
77
+ "type": "STATE",
78
+ "shape": [
79
+ 2
80
+ ]
81
+ }
82
+ },
83
+ "output_features": {
84
+ "action": {
85
+ "type": "ACTION",
86
+ "shape": [
87
+ 7
88
+ ]
89
+ }
90
+ },
91
+ "device": "cuda",
92
+ "use_amp": false,
93
+ "gradient_accumulation_steps": 1,
94
+ "chunk_size": 50,
95
+ "n_action_steps": 1,
96
+ "max_state_dim": 32,
97
+ "max_action_dim": 32,
98
+ "resize_imgs_with_padding": [
99
+ 512,
100
+ 512
101
+ ],
102
+ "empty_cameras": 0,
103
+ "adapt_to_pi_aloha": false,
104
+ "use_delta_joint_actions_aloha": false,
105
+ "tokenizer_max_length": 48,
106
+ "num_steps": 10,
107
+ "use_cache": true,
108
+ "freeze_vision_encoder": true,
109
+ "train_expert_only": false,
110
+ "train_state_proj": true,
111
+ "optimizer_lr": 0.0001,
112
+ "optimizer_betas": [
113
+ 0.9,
114
+ 0.95
115
+ ],
116
+ "optimizer_eps": 1e-08,
117
+ "optimizer_weight_decay": 1e-10,
118
+ "optimizer_grad_clip_norm": 10,
119
+ "scheduler_warmup_steps": 1000,
120
+ "scheduler_decay_steps": 100000,
121
+ "scheduler_decay_lr": 2.5e-06,
122
+ "vlm_model_name": "/pfss/mlde/workspaces/mlde_wsp_IAS_SAMMerge/VLA/duc/VLA-Humanoid-MW/SmolVLM2-500M-Video-Instruct",
123
+ "load_vlm_weights": true,
124
+ "add_image_special_tokens": false,
125
+ "attention_mode": "cross_attn",
126
+ "prefix_length": 0,
127
+ "pad_language_to": "max_length",
128
+ "num_expert_layers": 0,
129
+ "num_vlm_layers": 16,
130
+ "self_attn_every_n_layers": 2,
131
+ "expert_width_multiplier": 0.75,
132
+ "min_period": 0.004,
133
+ "max_period": 4.0
134
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8497d0b178b56284e3b72226c1d3868cd77b471616bc3919f87d1bc85339d476
3
+ size 906714192
train_config.json ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": {
3
+ "repo_id": ".",
4
+ "root": "/pfss/mlde/workspaces/mlde_wsp_IAS_SAMMerge/VLA/LIBERO/merged_libero_scale_100_mask_depth_noops_lerobot",
5
+ "episodes": null,
6
+ "image_transforms": {
7
+ "enable": true,
8
+ "max_num_transforms": 3,
9
+ "random_order": false,
10
+ "image_tfs": {
11
+ "hue": {
12
+ "weight": 1.0,
13
+ "type": "ColorJitter",
14
+ "kwargs": {
15
+ "hue": [
16
+ -0.05,
17
+ 0.05
18
+ ]
19
+ }
20
+ },
21
+ "contrast": {
22
+ "weight": 1.0,
23
+ "type": "ColorJitter",
24
+ "kwargs": {
25
+ "contrast": [
26
+ 0.8,
27
+ 1.2
28
+ ]
29
+ }
30
+ },
31
+ "sharpness": {
32
+ "weight": 1.0,
33
+ "type": "SharpnessJitter",
34
+ "kwargs": {
35
+ "sharpness": [
36
+ 0.5,
37
+ 1.5
38
+ ]
39
+ }
40
+ },
41
+ "brightness": {
42
+ "weight": 1.0,
43
+ "type": "ColorJitter",
44
+ "kwargs": {
45
+ "brightness": [
46
+ 0.8,
47
+ 1.2
48
+ ]
49
+ }
50
+ },
51
+ "saturation": {
52
+ "weight": 1.0,
53
+ "type": "ColorJitter",
54
+ "kwargs": {
55
+ "saturation": [
56
+ 0.5,
57
+ 1.5
58
+ ]
59
+ }
60
+ },
61
+ "crop_resize": {
62
+ "weight": 1.0,
63
+ "type": "RandomResizedCrop",
64
+ "kwargs": {
65
+ "size": [
66
+ 256,
67
+ 256
68
+ ],
69
+ "ratio": [
70
+ 1,
71
+ 1
72
+ ],
73
+ "scale": [
74
+ 0.9,
75
+ 0.95
76
+ ]
77
+ }
78
+ },
79
+ "rotate": {
80
+ "weight": 1.0,
81
+ "type": "RandomRotate",
82
+ "kwargs": {
83
+ "degrees": [
84
+ -5,
85
+ 5
86
+ ]
87
+ }
88
+ }
89
+ },
90
+ "wrist_tfs": {
91
+ "hue": {
92
+ "weight": 1.0,
93
+ "type": "ColorJitter",
94
+ "kwargs": {
95
+ "hue": [
96
+ -0.05,
97
+ 0.05
98
+ ]
99
+ }
100
+ },
101
+ "contrast": {
102
+ "weight": 1.0,
103
+ "type": "ColorJitter",
104
+ "kwargs": {
105
+ "contrast": [
106
+ 0.8,
107
+ 1.2
108
+ ]
109
+ }
110
+ },
111
+ "sharpness": {
112
+ "weight": 1.0,
113
+ "type": "SharpnessJitter",
114
+ "kwargs": {
115
+ "sharpness": [
116
+ 0.5,
117
+ 1.5
118
+ ]
119
+ }
120
+ },
121
+ "brightness": {
122
+ "weight": 1.0,
123
+ "type": "ColorJitter",
124
+ "kwargs": {
125
+ "brightness": [
126
+ 0.8,
127
+ 1.2
128
+ ]
129
+ }
130
+ },
131
+ "saturation": {
132
+ "weight": 1.0,
133
+ "type": "ColorJitter",
134
+ "kwargs": {
135
+ "saturation": [
136
+ 0.5,
137
+ 1.5
138
+ ]
139
+ }
140
+ }
141
+ }
142
+ },
143
+ "revision": null,
144
+ "use_imagenet_stats": true,
145
+ "video_backend": "torchcodec",
146
+ "vqa_data_path": null
147
+ },
148
+ "env": null,
149
+ "policy": {
150
+ "type": "smolvla",
151
+ "n_obs_steps": 1,
152
+ "normalization_mapping": {
153
+ "VISUAL": "IDENTITY",
154
+ "STATE": "MEAN_STD",
155
+ "ACTION": "MEAN_STD"
156
+ },
157
+ "input_features": {
158
+ "observation.images.image": {
159
+ "type": "VISUAL",
160
+ "shape": [
161
+ 3,
162
+ 256,
163
+ 256
164
+ ]
165
+ },
166
+ "observation.images.wrist_image": {
167
+ "type": "VISUAL",
168
+ "shape": [
169
+ 3,
170
+ 256,
171
+ 256
172
+ ]
173
+ },
174
+ "observation.images.image_mask": {
175
+ "type": "VISUAL",
176
+ "shape": [
177
+ 3,
178
+ 256,
179
+ 256
180
+ ]
181
+ },
182
+ "observation.images.wrist_mask": {
183
+ "type": "VISUAL",
184
+ "shape": [
185
+ 3,
186
+ 256,
187
+ 256
188
+ ]
189
+ },
190
+ "observation.images.object_of_interest_mask": {
191
+ "type": "VISUAL",
192
+ "shape": [
193
+ 3,
194
+ 256,
195
+ 256
196
+ ]
197
+ },
198
+ "observation.images.object_of_interest_wrist_mask": {
199
+ "type": "VISUAL",
200
+ "shape": [
201
+ 3,
202
+ 256,
203
+ 256
204
+ ]
205
+ },
206
+ "observation.state": {
207
+ "type": "STATE",
208
+ "shape": [
209
+ 8
210
+ ]
211
+ },
212
+ "observation.states.ee_state": {
213
+ "type": "STATE",
214
+ "shape": [
215
+ 6
216
+ ]
217
+ },
218
+ "observation.states.joint_state": {
219
+ "type": "STATE",
220
+ "shape": [
221
+ 7
222
+ ]
223
+ },
224
+ "observation.states.gripper_state": {
225
+ "type": "STATE",
226
+ "shape": [
227
+ 2
228
+ ]
229
+ }
230
+ },
231
+ "output_features": {
232
+ "action": {
233
+ "type": "ACTION",
234
+ "shape": [
235
+ 7
236
+ ]
237
+ }
238
+ },
239
+ "device": "cuda",
240
+ "use_amp": false,
241
+ "gradient_accumulation_steps": 1,
242
+ "chunk_size": 50,
243
+ "n_action_steps": 1,
244
+ "max_state_dim": 32,
245
+ "max_action_dim": 32,
246
+ "resize_imgs_with_padding": [
247
+ 512,
248
+ 512
249
+ ],
250
+ "empty_cameras": 0,
251
+ "adapt_to_pi_aloha": false,
252
+ "use_delta_joint_actions_aloha": false,
253
+ "tokenizer_max_length": 48,
254
+ "num_steps": 10,
255
+ "use_cache": true,
256
+ "freeze_vision_encoder": true,
257
+ "train_expert_only": false,
258
+ "train_state_proj": true,
259
+ "optimizer_lr": 0.0001,
260
+ "optimizer_betas": [
261
+ 0.9,
262
+ 0.95
263
+ ],
264
+ "optimizer_eps": 1e-08,
265
+ "optimizer_weight_decay": 1e-10,
266
+ "optimizer_grad_clip_norm": 10,
267
+ "scheduler_warmup_steps": 1000,
268
+ "scheduler_decay_steps": 100000,
269
+ "scheduler_decay_lr": 2.5e-06,
270
+ "vlm_model_name": "/pfss/mlde/workspaces/mlde_wsp_IAS_SAMMerge/VLA/duc/VLA-Humanoid-MW/SmolVLM2-500M-Video-Instruct",
271
+ "load_vlm_weights": true,
272
+ "add_image_special_tokens": false,
273
+ "attention_mode": "cross_attn",
274
+ "prefix_length": 0,
275
+ "pad_language_to": "max_length",
276
+ "num_expert_layers": 0,
277
+ "num_vlm_layers": 16,
278
+ "self_attn_every_n_layers": 2,
279
+ "expert_width_multiplier": 0.75,
280
+ "min_period": 0.004,
281
+ "max_period": 4.0
282
+ },
283
+ "output_dir": "outputs/train/2025-12-31/04-57-32_libero_100%_baseline_scratch_unfreezevlm_100kdecay",
284
+ "job_name": "libero_100%_baseline_scratch_unfreezevlm_100kdecay",
285
+ "resume": false,
286
+ "seed": 42,
287
+ "num_workers": 8,
288
+ "batch_size": 64,
289
+ "steps": 100000,
290
+ "eval_freq": 20000,
291
+ "log_freq": 200,
292
+ "save_checkpoint": true,
293
+ "save_freq": 10000,
294
+ "use_policy_training_preset": true,
295
+ "optimizer": {
296
+ "type": "adamw",
297
+ "lr": 0.0001,
298
+ "weight_decay": 1e-10,
299
+ "grad_clip_norm": 10,
300
+ "betas": [
301
+ 0.9,
302
+ 0.95
303
+ ],
304
+ "eps": 1e-08
305
+ },
306
+ "scheduler": {
307
+ "type": "cosine_decay_with_warmup",
308
+ "num_warmup_steps": 1000,
309
+ "num_decay_steps": 100000,
310
+ "peak_lr": 0.0001,
311
+ "decay_lr": 2.5e-06
312
+ },
313
+ "eval": {
314
+ "n_episodes": 50,
315
+ "batch_size": 50,
316
+ "use_async_envs": false
317
+ },
318
+ "wandb": {
319
+ "enable": true,
320
+ "disable_artifact": true,
321
+ "project": "smolvla",
322
+ "entity": "Robotics_VLA",
323
+ "notes": null,
324
+ "run_id": null,
325
+ "mode": "online"
326
+ }
327
+ }