cfu committed on
Commit
075ee14
·
verified ·
1 Parent(s): 7b31efd

Upload policy weights, train config and readme

Browse files
Files changed (4) hide show
  1. README.md +5 -5
  2. config.json +167 -42
  3. model.safetensors +2 -2
  4. train_config.json +183 -54
README.md CHANGED
@@ -2,20 +2,20 @@
2
  datasets: cfu/record_test_ball
3
  library_name: lerobot
4
  license: apache-2.0
5
- model_name: diffusion
6
  pipeline_tag: robotics
7
  tags:
8
- - lerobot
9
- - diffusion
10
  - robotics
 
 
11
  ---
12
 
13
- # Model Card for diffusion
14
 
15
  <!-- Provide a quick summary of what the model is/does. -->
16
 
17
 
18
- [Diffusion Policy](https://huggingface.co/papers/2303.04137) treats visuomotor control as a generative diffusion process, producing smooth, multi-step action trajectories that excel at contact-rich manipulation.
19
 
20
 
21
  This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
 
2
  datasets: cfu/record_test_ball
3
  library_name: lerobot
4
  license: apache-2.0
5
+ model_name: xvla
6
  pipeline_tag: robotics
7
  tags:
 
 
8
  - robotics
9
+ - lerobot
10
+ - xvla
11
  ---
12
 
13
+ # Model Card for xvla
14
 
15
  <!-- Provide a quick summary of what the model is/does. -->
16
 
17
 
18
+ _Model type not recognized; please update this template._
19
 
20
 
21
  This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "type": "diffusion",
3
- "n_obs_steps": 2,
4
  "input_features": {
5
  "observation.state": {
6
  "type": "STATE",
@@ -40,51 +40,176 @@
40
  "private": null,
41
  "tags": null,
42
  "license": null,
43
- "pretrained_path": null,
44
- "horizon": 16,
45
- "n_action_steps": 8,
 
46
  "normalization_mapping": {
47
- "VISUAL": "MEAN_STD",
48
- "STATE": "MIN_MAX",
49
- "ACTION": "MIN_MAX"
50
  },
51
- "drop_n_last_frames": 7,
52
- "vision_backbone": "resnet18",
53
- "crop_shape": [
54
- 84,
55
- 84
56
- ],
57
- "crop_is_random": true,
58
- "pretrained_backbone_weights": null,
59
- "use_group_norm": true,
60
- "spatial_softmax_num_keypoints": 32,
61
- "use_separate_rgb_encoder_per_camera": false,
62
- "down_dims": [
63
- 512,
64
- 1024,
65
- 2048
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  ],
67
- "kernel_size": 5,
68
- "n_groups": 8,
69
- "diffusion_step_embed_dim": 128,
70
- "use_film_scale_modulation": true,
71
- "noise_scheduler_type": "DDPM",
72
- "num_train_timesteps": 100,
73
- "beta_schedule": "squaredcos_cap_v2",
74
- "beta_start": 0.0001,
75
- "beta_end": 0.02,
76
- "prediction_type": "epsilon",
77
- "clip_sample": true,
78
- "clip_sample_range": 1.0,
79
- "num_inference_steps": null,
80
- "do_mask_loss_for_padding": false,
81
  "optimizer_lr": 0.0001,
82
  "optimizer_betas": [
83
- 0.95,
84
- 0.999
85
  ],
86
  "optimizer_eps": 1e-08,
87
- "optimizer_weight_decay": 1e-06,
88
- "scheduler_name": "cosine",
89
- "scheduler_warmup_steps": 500
 
 
 
 
90
  }
 
1
  {
2
+ "type": "xvla",
3
+ "n_obs_steps": 1,
4
  "input_features": {
5
  "observation.state": {
6
  "type": "STATE",
 
40
  "private": null,
41
  "tags": null,
42
  "license": null,
43
+ "pretrained_path": "lerobot/xvla-base",
44
+ "chunk_size": 32,
45
+ "n_action_steps": 32,
46
+ "dtype": "float32",
47
  "normalization_mapping": {
48
+ "VISUAL": "IDENTITY",
49
+ "STATE": "IDENTITY",
50
+ "ACTION": "IDENTITY"
51
  },
52
+ "florence_config": {
53
+ "model_type": "florence2",
54
+ "bos_token_id": 0,
55
+ "eos_token_id": 2,
56
+ "ignore_index": -100,
57
+ "pad_token_id": 1,
58
+ "projection_dim": 1024,
59
+ "text_config": {
60
+ "vocab_size": 51289,
61
+ "activation_dropout": 0.1,
62
+ "activation_function": "gelu",
63
+ "add_bias_logits": false,
64
+ "add_final_layer_norm": false,
65
+ "attention_dropout": 0.1,
66
+ "bos_token_id": 0,
67
+ "classif_dropout": 0.1,
68
+ "classifier_dropout": 0.0,
69
+ "d_model": 1024,
70
+ "decoder_attention_heads": 16,
71
+ "decoder_ffn_dim": 4096,
72
+ "decoder_layerdrop": 0.0,
73
+ "decoder_layers": 12,
74
+ "decoder_start_token_id": 2,
75
+ "dropout": 0.1,
76
+ "early_stopping": true,
77
+ "encoder_attention_heads": 16,
78
+ "encoder_ffn_dim": 4096,
79
+ "encoder_layerdrop": 0.0,
80
+ "encoder_layers": 12,
81
+ "eos_token_id": 2,
82
+ "forced_eos_token_id": 2,
83
+ "forced_bos_token_id": 0,
84
+ "gradient_checkpointing": false,
85
+ "init_std": 0.02,
86
+ "is_encoder_decoder": true,
87
+ "label2id": {
88
+ "LABEL_0": 0,
89
+ "LABEL_1": 1,
90
+ "LABEL_2": 2
91
+ },
92
+ "max_position_embeddings": 4096,
93
+ "no_repeat_ngram_size": 3,
94
+ "normalize_before": false,
95
+ "num_hidden_layers": 12,
96
+ "pad_token_id": 1,
97
+ "scale_embedding": false,
98
+ "num_beams": 3
99
+ },
100
+ "vision_config": {
101
+ "model_type": "davit",
102
+ "drop_path_rate": 0.1,
103
+ "patch_size": [
104
+ 7,
105
+ 3,
106
+ 3,
107
+ 3
108
+ ],
109
+ "patch_stride": [
110
+ 4,
111
+ 2,
112
+ 2,
113
+ 2
114
+ ],
115
+ "patch_padding": [
116
+ 3,
117
+ 1,
118
+ 1,
119
+ 1
120
+ ],
121
+ "patch_prenorm": [
122
+ false,
123
+ true,
124
+ true,
125
+ true
126
+ ],
127
+ "enable_checkpoint": false,
128
+ "dim_embed": [
129
+ 256,
130
+ 512,
131
+ 1024,
132
+ 2048
133
+ ],
134
+ "num_heads": [
135
+ 8,
136
+ 16,
137
+ 32,
138
+ 64
139
+ ],
140
+ "num_groups": [
141
+ 8,
142
+ 16,
143
+ 32,
144
+ 64
145
+ ],
146
+ "depths": [
147
+ 1,
148
+ 1,
149
+ 9,
150
+ 1
151
+ ],
152
+ "window_size": 12,
153
+ "projection_dim": 1024,
154
+ "visual_temporal_embedding": {
155
+ "type": "COSINE",
156
+ "max_temporal_embeddings": 100
157
+ },
158
+ "image_pos_embed": {
159
+ "type": "learned_abs_2d",
160
+ "max_pos_embeddings": 50
161
+ },
162
+ "image_feature_source": [
163
+ "spatial_avg_pool",
164
+ "temporal_avg_pool"
165
+ ]
166
+ },
167
+ "vocab_size": 51289,
168
+ "torch_dtype": "float32",
169
+ "is_encoder_decoder": true
170
+ },
171
+ "vision_config": null,
172
+ "text_config": null,
173
+ "tokenizer_name": "facebook/bart-large",
174
+ "tokenizer_max_length": 64,
175
+ "tokenizer_padding_side": "right",
176
+ "pad_language_to": "max_length",
177
+ "hidden_size": 1024,
178
+ "depth": 24,
179
+ "num_heads": 16,
180
+ "mlp_ratio": 4.0,
181
+ "num_domains": 30,
182
+ "len_soft_prompts": 32,
183
+ "dim_time": 32,
184
+ "max_len_seq": 512,
185
+ "use_hetero_proj": false,
186
+ "action_mode": "ee6d",
187
+ "num_denoising_steps": 10,
188
+ "use_proprio": true,
189
+ "max_state_dim": 20,
190
+ "max_action_dim": 20,
191
+ "domain_feature_key": null,
192
+ "resize_imgs_with_padding": [
193
+ 384,
194
+ 384
195
  ],
196
+ "num_image_views": 2,
197
+ "empty_cameras": 0,
198
+ "freeze_vision_encoder": false,
199
+ "freeze_language_encoder": false,
200
+ "train_policy_transformer": true,
201
+ "train_soft_prompts": true,
 
 
 
 
 
 
 
 
202
  "optimizer_lr": 0.0001,
203
  "optimizer_betas": [
204
+ 0.9,
205
+ 0.99
206
  ],
207
  "optimizer_eps": 1e-08,
208
+ "optimizer_weight_decay": 0.0,
209
+ "optimizer_grad_clip_norm": 10.0,
210
+ "optimizer_soft_prompt_lr_scale": 1.0,
211
+ "optimizer_soft_prompt_warmup_lr_scale": null,
212
+ "scheduler_warmup_steps": 1000,
213
+ "scheduler_decay_steps": 30000,
214
+ "scheduler_decay_lr": 2.5e-06
215
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b7f69174642825cf7419ddb92ad24cf3930ccbaa900057e2f1c24fa001a0cbc
3
- size 1066516384
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ae464c1ac224e3a72e26a58e74d624b5223745eb9ffa2ef838fd4439a6715e5
3
+ size 3519073692
train_config.json CHANGED
@@ -81,8 +81,8 @@
81
  },
82
  "env": null,
83
  "policy": {
84
- "type": "diffusion",
85
- "n_obs_steps": 2,
86
  "input_features": {
87
  "observation.state": {
88
  "type": "STATE",
@@ -122,60 +122,185 @@
122
  "private": null,
123
  "tags": null,
124
  "license": null,
125
- "pretrained_path": null,
126
- "horizon": 16,
127
- "n_action_steps": 8,
 
128
  "normalization_mapping": {
129
- "VISUAL": "MEAN_STD",
130
- "STATE": "MIN_MAX",
131
- "ACTION": "MIN_MAX"
132
  },
133
- "drop_n_last_frames": 7,
134
- "vision_backbone": "resnet18",
135
- "crop_shape": [
136
- 84,
137
- 84
138
- ],
139
- "crop_is_random": true,
140
- "pretrained_backbone_weights": null,
141
- "use_group_norm": true,
142
- "spatial_softmax_num_keypoints": 32,
143
- "use_separate_rgb_encoder_per_camera": false,
144
- "down_dims": [
145
- 512,
146
- 1024,
147
- 2048
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  ],
149
- "kernel_size": 5,
150
- "n_groups": 8,
151
- "diffusion_step_embed_dim": 128,
152
- "use_film_scale_modulation": true,
153
- "noise_scheduler_type": "DDPM",
154
- "num_train_timesteps": 100,
155
- "beta_schedule": "squaredcos_cap_v2",
156
- "beta_start": 0.0001,
157
- "beta_end": 0.02,
158
- "prediction_type": "epsilon",
159
- "clip_sample": true,
160
- "clip_sample_range": 1.0,
161
- "num_inference_steps": null,
162
- "do_mask_loss_for_padding": false,
163
  "optimizer_lr": 0.0001,
164
  "optimizer_betas": [
165
- 0.95,
166
- 0.999
167
  ],
168
  "optimizer_eps": 1e-08,
169
- "optimizer_weight_decay": 1e-06,
170
- "scheduler_name": "cosine",
171
- "scheduler_warmup_steps": 500
 
 
 
 
172
  },
173
- "output_dir": "outputs/train/diff_so101_su_devoile_5ksteps",
174
- "job_name": "act_so101_3cube_1ksteps",
175
  "resume": false,
176
  "seed": 1000,
177
  "num_workers": 4,
178
- "batch_size": 32,
179
  "steps": 1000,
180
  "eval_freq": 20000,
181
  "log_freq": 200,
@@ -183,20 +308,24 @@
183
  "save_freq": 20000,
184
  "use_policy_training_preset": true,
185
  "optimizer": {
186
- "type": "adam",
187
  "lr": 0.0001,
188
- "weight_decay": 1e-06,
189
  "grad_clip_norm": 10.0,
190
  "betas": [
191
- 0.95,
192
- 0.999
193
  ],
194
- "eps": 1e-08
 
 
195
  },
196
  "scheduler": {
197
- "type": "diffuser",
198
- "num_warmup_steps": 500,
199
- "name": "cosine"
 
 
200
  },
201
  "eval": {
202
  "n_episodes": 50,
@@ -209,7 +338,7 @@
209
  "project": "lerobot",
210
  "entity": null,
211
  "notes": null,
212
- "run_id": "6s6dkw3g",
213
  "mode": null
214
  },
215
  "checkpoint_path": null,
 
81
  },
82
  "env": null,
83
  "policy": {
84
+ "type": "xvla",
85
+ "n_obs_steps": 1,
86
  "input_features": {
87
  "observation.state": {
88
  "type": "STATE",
 
122
  "private": null,
123
  "tags": null,
124
  "license": null,
125
+ "pretrained_path": "lerobot/xvla-base",
126
+ "chunk_size": 32,
127
+ "n_action_steps": 32,
128
+ "dtype": "float32",
129
  "normalization_mapping": {
130
+ "VISUAL": "IDENTITY",
131
+ "STATE": "IDENTITY",
132
+ "ACTION": "IDENTITY"
133
  },
134
+ "florence_config": {
135
+ "model_type": "florence2",
136
+ "bos_token_id": 0,
137
+ "eos_token_id": 2,
138
+ "ignore_index": -100,
139
+ "pad_token_id": 1,
140
+ "projection_dim": 1024,
141
+ "text_config": {
142
+ "vocab_size": 51289,
143
+ "activation_dropout": 0.1,
144
+ "activation_function": "gelu",
145
+ "add_bias_logits": false,
146
+ "add_final_layer_norm": false,
147
+ "attention_dropout": 0.1,
148
+ "bos_token_id": 0,
149
+ "classif_dropout": 0.1,
150
+ "classifier_dropout": 0.0,
151
+ "d_model": 1024,
152
+ "decoder_attention_heads": 16,
153
+ "decoder_ffn_dim": 4096,
154
+ "decoder_layerdrop": 0.0,
155
+ "decoder_layers": 12,
156
+ "decoder_start_token_id": 2,
157
+ "dropout": 0.1,
158
+ "early_stopping": true,
159
+ "encoder_attention_heads": 16,
160
+ "encoder_ffn_dim": 4096,
161
+ "encoder_layerdrop": 0.0,
162
+ "encoder_layers": 12,
163
+ "eos_token_id": 2,
164
+ "forced_eos_token_id": 2,
165
+ "forced_bos_token_id": 0,
166
+ "gradient_checkpointing": false,
167
+ "init_std": 0.02,
168
+ "is_encoder_decoder": true,
169
+ "label2id": {
170
+ "LABEL_0": 0,
171
+ "LABEL_1": 1,
172
+ "LABEL_2": 2
173
+ },
174
+ "max_position_embeddings": 4096,
175
+ "no_repeat_ngram_size": 3,
176
+ "normalize_before": false,
177
+ "num_hidden_layers": 12,
178
+ "pad_token_id": 1,
179
+ "scale_embedding": false,
180
+ "num_beams": 3
181
+ },
182
+ "vision_config": {
183
+ "model_type": "davit",
184
+ "drop_path_rate": 0.1,
185
+ "patch_size": [
186
+ 7,
187
+ 3,
188
+ 3,
189
+ 3
190
+ ],
191
+ "patch_stride": [
192
+ 4,
193
+ 2,
194
+ 2,
195
+ 2
196
+ ],
197
+ "patch_padding": [
198
+ 3,
199
+ 1,
200
+ 1,
201
+ 1
202
+ ],
203
+ "patch_prenorm": [
204
+ false,
205
+ true,
206
+ true,
207
+ true
208
+ ],
209
+ "enable_checkpoint": false,
210
+ "dim_embed": [
211
+ 256,
212
+ 512,
213
+ 1024,
214
+ 2048
215
+ ],
216
+ "num_heads": [
217
+ 8,
218
+ 16,
219
+ 32,
220
+ 64
221
+ ],
222
+ "num_groups": [
223
+ 8,
224
+ 16,
225
+ 32,
226
+ 64
227
+ ],
228
+ "depths": [
229
+ 1,
230
+ 1,
231
+ 9,
232
+ 1
233
+ ],
234
+ "window_size": 12,
235
+ "projection_dim": 1024,
236
+ "visual_temporal_embedding": {
237
+ "type": "COSINE",
238
+ "max_temporal_embeddings": 100
239
+ },
240
+ "image_pos_embed": {
241
+ "type": "learned_abs_2d",
242
+ "max_pos_embeddings": 50
243
+ },
244
+ "image_feature_source": [
245
+ "spatial_avg_pool",
246
+ "temporal_avg_pool"
247
+ ]
248
+ },
249
+ "vocab_size": 51289,
250
+ "torch_dtype": "float32",
251
+ "is_encoder_decoder": true
252
+ },
253
+ "vision_config": null,
254
+ "text_config": null,
255
+ "tokenizer_name": "facebook/bart-large",
256
+ "tokenizer_max_length": 64,
257
+ "tokenizer_padding_side": "right",
258
+ "pad_language_to": "max_length",
259
+ "hidden_size": 1024,
260
+ "depth": 24,
261
+ "num_heads": 16,
262
+ "mlp_ratio": 4.0,
263
+ "num_domains": 30,
264
+ "len_soft_prompts": 32,
265
+ "dim_time": 32,
266
+ "max_len_seq": 512,
267
+ "use_hetero_proj": false,
268
+ "action_mode": "ee6d",
269
+ "num_denoising_steps": 10,
270
+ "use_proprio": true,
271
+ "max_state_dim": 20,
272
+ "max_action_dim": 20,
273
+ "domain_feature_key": null,
274
+ "resize_imgs_with_padding": [
275
+ 384,
276
+ 384
277
  ],
278
+ "num_image_views": 2,
279
+ "empty_cameras": 0,
280
+ "freeze_vision_encoder": false,
281
+ "freeze_language_encoder": false,
282
+ "train_policy_transformer": true,
283
+ "train_soft_prompts": true,
 
 
 
 
 
 
 
 
284
  "optimizer_lr": 0.0001,
285
  "optimizer_betas": [
286
+ 0.9,
287
+ 0.99
288
  ],
289
  "optimizer_eps": 1e-08,
290
+ "optimizer_weight_decay": 0.0,
291
+ "optimizer_grad_clip_norm": 10.0,
292
+ "optimizer_soft_prompt_lr_scale": 1.0,
293
+ "optimizer_soft_prompt_warmup_lr_scale": null,
294
+ "scheduler_warmup_steps": 1000,
295
+ "scheduler_decay_steps": 30000,
296
+ "scheduler_decay_lr": 2.5e-06
297
  },
298
+ "output_dir": "outputs/train/xvla_finetune_4",
299
+ "job_name": "xvla_finetune_4",
300
  "resume": false,
301
  "seed": 1000,
302
  "num_workers": 4,
303
+ "batch_size": 16,
304
  "steps": 1000,
305
  "eval_freq": 20000,
306
  "log_freq": 200,
 
308
  "save_freq": 20000,
309
  "use_policy_training_preset": true,
310
  "optimizer": {
311
+ "type": "xvla-adamw",
312
  "lr": 0.0001,
313
+ "weight_decay": 0.0,
314
  "grad_clip_norm": 10.0,
315
  "betas": [
316
+ 0.9,
317
+ 0.99
318
  ],
319
+ "eps": 1e-08,
320
+ "soft_prompt_lr_scale": 1.0,
321
+ "soft_prompt_warmup_lr_scale": null
322
  },
323
  "scheduler": {
324
+ "type": "cosine_decay_with_warmup",
325
+ "num_warmup_steps": 1000,
326
+ "num_decay_steps": 30000,
327
+ "peak_lr": 0.0001,
328
+ "decay_lr": 2.5e-06
329
  },
330
  "eval": {
331
  "n_episodes": 50,
 
338
  "project": "lerobot",
339
  "entity": null,
340
  "notes": null,
341
+ "run_id": "chzzy5r6",
342
  "mode": null
343
  },
344
  "checkpoint_path": null,