mfyu committed
Commit b63c559 · verified · 1 Parent(s): 6c91370

Upload policy weights, train config and readme

Files changed (4)
  1. README.md +5 -4
  2. config.json +59 -32
  3. model.safetensors +2 -2
  4. train_config.json +80 -44
README.md CHANGED
@@ -1,21 +1,22 @@
 ---
+base_model: lerobot/smolvla_base
 datasets: mfyu/piper-make-panda
 library_name: lerobot
 license: apache-2.0
-model_name: act
+model_name: smolvla
 pipeline_tag: robotics
 tags:
-- act
 - robotics
+- smolvla
 - lerobot
 ---

-# Model Card for act
+# Model Card for smolvla

 <!-- Provide a quick summary of what the model is/does. -->


-[Action Chunking with Transformers (ACT)](https://huggingface.co/papers/2304.13705) is an imitation-learning method that predicts short action chunks instead of single steps. It learns from teleoperated data and often achieves high success rates.
+[SmolVLA](https://huggingface.co/papers/2506.01844) is a compact, efficient vision-language-action model that achieves competitive performance at reduced computational costs and can be deployed on consumer-grade hardware.


 This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
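A loading sketch to accompany the card (the import path varies across LeRobot releases and the repo id below is a placeholder, so treat both as assumptions):

from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy
# Older releases use: from lerobot.common.policies.smolvla.modeling_smolvla import SmolVLAPolicy

# Placeholder repo id; substitute this model's actual Hub id.
policy = SmolVLAPolicy.from_pretrained("mfyu/<this_repo>")
policy.eval()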
config.json CHANGED
@@ -1,27 +1,35 @@
 {
-    "type": "act",
+    "type": "smolvla",
     "n_obs_steps": 1,
     "input_features": {
         "observation.state": {
             "type": "STATE",
             "shape": [
-                8
+                6
             ]
         },
-        "observation.images.head": {
+        "observation.images.camera1": {
             "type": "VISUAL",
             "shape": [
                 3,
-                480,
-                640
+                256,
+                256
             ]
         },
-        "observation.images.hand": {
+        "observation.images.camera2": {
             "type": "VISUAL",
             "shape": [
                 3,
-                480,
-                640
+                256,
+                256
+            ]
+        },
+        "observation.images.camera3": {
+            "type": "VISUAL",
+            "shape": [
+                3,
+                256,
+                256
             ]
         }
     },
@@ -29,7 +37,7 @@
         "action": {
             "type": "ACTION",
             "shape": [
-                7
+                6
             ]
         }
     },
@@ -40,31 +48,50 @@
     "private": null,
     "tags": null,
     "license": null,
-    "pretrained_path": null,
-    "chunk_size": 100,
-    "n_action_steps": 100,
+    "pretrained_path": "lerobot/smolvla_base",
+    "chunk_size": 50,
+    "n_action_steps": 50,
     "normalization_mapping": {
-        "VISUAL": "MEAN_STD",
+        "VISUAL": "IDENTITY",
         "STATE": "MEAN_STD",
         "ACTION": "MEAN_STD"
     },
-    "vision_backbone": "resnet18",
-    "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
-    "replace_final_stride_with_dilation": false,
-    "pre_norm": false,
-    "dim_model": 512,
-    "n_heads": 8,
-    "dim_feedforward": 3200,
-    "feedforward_activation": "relu",
-    "n_encoder_layers": 4,
-    "n_decoder_layers": 1,
-    "use_vae": true,
-    "latent_dim": 32,
-    "n_vae_encoder_layers": 4,
-    "temporal_ensemble_coeff": null,
-    "dropout": 0.1,
-    "kl_weight": 10.0,
-    "optimizer_lr": 1e-05,
-    "optimizer_weight_decay": 0.0001,
-    "optimizer_lr_backbone": 1e-05
+    "max_state_dim": 32,
+    "max_action_dim": 32,
+    "resize_imgs_with_padding": [
+        512,
+        512
+    ],
+    "empty_cameras": 0,
+    "adapt_to_pi_aloha": false,
+    "use_delta_joint_actions_aloha": false,
+    "tokenizer_max_length": 48,
+    "num_steps": 10,
+    "use_cache": true,
+    "freeze_vision_encoder": true,
+    "train_expert_only": true,
+    "train_state_proj": true,
+    "optimizer_lr": 0.0001,
+    "optimizer_betas": [
+        0.9,
+        0.95
+    ],
+    "optimizer_eps": 1e-08,
+    "optimizer_weight_decay": 1e-10,
+    "optimizer_grad_clip_norm": 10.0,
+    "scheduler_warmup_steps": 1000,
+    "scheduler_decay_steps": 30000,
+    "scheduler_decay_lr": 2.5e-06,
+    "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
+    "load_vlm_weights": true,
+    "add_image_special_tokens": false,
+    "attention_mode": "cross_attn",
+    "prefix_length": 0,
+    "pad_language_to": "max_length",
+    "num_expert_layers": 0,
+    "num_vlm_layers": 16,
+    "self_attn_every_n_layers": 2,
+    "expert_width_multiplier": 0.75,
+    "min_period": 0.004,
+    "max_period": 4.0
 }
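Taken together, the new input_features describe a 6-D state, three 256×256 RGB cameras, and 6-D actions predicted in 50-step chunks. A minimal sketch of a matching observation batch, assuming LeRobot's select_action API and a "task" key for the language instruction (the exact signature, the placeholder repo id, and the instruction string are assumptions):

import torch
from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy

policy = SmolVLAPolicy.from_pretrained("mfyu/<this_repo>")       # placeholder repo id
batch = {
    "observation.state": torch.zeros(1, 6),                     # 6-D robot state
    "observation.images.camera1": torch.zeros(1, 3, 256, 256),  # RGB, CHW, in [0, 1]
    "observation.images.camera2": torch.zeros(1, 3, 256, 256),
    "observation.images.camera3": torch.zeros(1, 3, 256, 256),
    "task": "pick up the object",                               # hypothetical instruction
}
with torch.no_grad():
    action = policy.select_action(batch)  # expected shape: (1, 6)

Because VISUAL normalization is IDENTITY, images should arrive already scaled (LeRobot works with float images in [0, 1]); resize_imgs_with_padding then letterboxes them to 512×512 inside the policy.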
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:45c08d9e8b7573a64c5b91057215781a9c6b6048f3bd7b4331f2dd5717ab15cd
-size 206712028
+oid sha256:a3719e7bb1b08c84ae232f3863bc2366498496c11f7a908a0945f9054b9a9ee3
+size 906712520
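The LFS pointer swap is the actual weight change: the checkpoint grows from ~207 MB (ACT) to ~907 MB (SmolVLA). A quick way to sanity-check such a file without instantiating the model, using safetensors' lazy slicing API:

import math
from safetensors import safe_open

with safe_open("model.safetensors", framework="pt") as f:
    # get_slice reads only tensor metadata, so no weights are materialized
    n_params = sum(math.prod(f.get_slice(key).get_shape()) for key in f.keys())
print(f"~{n_params / 1e6:.0f}M parameters")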
train_config.json CHANGED
@@ -81,29 +81,37 @@
     },
     "env": null,
     "policy": {
-        "type": "act",
+        "type": "smolvla",
         "n_obs_steps": 1,
         "input_features": {
            "observation.state": {
                 "type": "STATE",
                 "shape": [
-                    8
+                    6
                 ]
             },
-            "observation.images.head": {
+            "observation.images.camera1": {
                 "type": "VISUAL",
                 "shape": [
                     3,
-                    480,
-                    640
+                    256,
+                    256
                 ]
             },
-            "observation.images.hand": {
+            "observation.images.camera2": {
                 "type": "VISUAL",
                 "shape": [
                     3,
-                    480,
-                    640
+                    256,
+                    256
+                ]
+            },
+            "observation.images.camera3": {
+                "type": "VISUAL",
+                "shape": [
+                    3,
+                    256,
+                    256
                 ]
             }
         },
@@ -111,7 +119,7 @@
             "action": {
                 "type": "ACTION",
                 "shape": [
-                    7
+                    6
                 ]
             }
         },
@@ -122,58 +130,83 @@
         "private": null,
         "tags": null,
         "license": null,
-        "pretrained_path": null,
-        "chunk_size": 100,
-        "n_action_steps": 100,
+        "pretrained_path": "lerobot/smolvla_base",
+        "chunk_size": 50,
+        "n_action_steps": 50,
         "normalization_mapping": {
-            "VISUAL": "MEAN_STD",
+            "VISUAL": "IDENTITY",
             "STATE": "MEAN_STD",
             "ACTION": "MEAN_STD"
         },
-        "vision_backbone": "resnet18",
-        "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
-        "replace_final_stride_with_dilation": false,
-        "pre_norm": false,
-        "dim_model": 512,
-        "n_heads": 8,
-        "dim_feedforward": 3200,
-        "feedforward_activation": "relu",
-        "n_encoder_layers": 4,
-        "n_decoder_layers": 1,
-        "use_vae": true,
-        "latent_dim": 32,
-        "n_vae_encoder_layers": 4,
-        "temporal_ensemble_coeff": null,
-        "dropout": 0.1,
-        "kl_weight": 10.0,
-        "optimizer_lr": 1e-05,
-        "optimizer_weight_decay": 0.0001,
-        "optimizer_lr_backbone": 1e-05
+        "max_state_dim": 32,
+        "max_action_dim": 32,
+        "resize_imgs_with_padding": [
+            512,
+            512
+        ],
+        "empty_cameras": 0,
+        "adapt_to_pi_aloha": false,
+        "use_delta_joint_actions_aloha": false,
+        "tokenizer_max_length": 48,
+        "num_steps": 10,
+        "use_cache": true,
+        "freeze_vision_encoder": true,
+        "train_expert_only": true,
+        "train_state_proj": true,
+        "optimizer_lr": 0.0001,
+        "optimizer_betas": [
+            0.9,
+            0.95
+        ],
+        "optimizer_eps": 1e-08,
+        "optimizer_weight_decay": 1e-10,
+        "optimizer_grad_clip_norm": 10.0,
+        "scheduler_warmup_steps": 1000,
+        "scheduler_decay_steps": 30000,
+        "scheduler_decay_lr": 2.5e-06,
+        "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
+        "load_vlm_weights": true,
+        "add_image_special_tokens": false,
+        "attention_mode": "cross_attn",
+        "prefix_length": 0,
+        "pad_language_to": "max_length",
+        "num_expert_layers": 0,
+        "num_vlm_layers": 16,
+        "self_attn_every_n_layers": 2,
+        "expert_width_multiplier": 0.75,
+        "min_period": 0.004,
+        "max_period": 4.0
     },
-    "output_dir": "outputs/train/act_piper_make_panda_1028_nums_15",
-    "job_name": "act_piper_make_panda",
+    "output_dir": "outputs/train/my_smolvla",
+    "job_name": "my_smolvla_training",
     "resume": false,
     "seed": 1000,
     "num_workers": 4,
-    "batch_size": 8,
-    "steps": 100000,
+    "batch_size": 64,
+    "steps": 20000,
     "eval_freq": 20000,
     "log_freq": 200,
     "save_checkpoint": true,
-    "save_freq": 20000,
+    "save_freq": 5000,
     "use_policy_training_preset": true,
     "optimizer": {
         "type": "adamw",
-        "lr": 1e-05,
-        "weight_decay": 0.0001,
+        "lr": 0.0001,
+        "weight_decay": 1e-10,
         "grad_clip_norm": 10.0,
         "betas": [
             0.9,
-            0.999
+            0.95
         ],
         "eps": 1e-08
     },
-    "scheduler": null,
+    "scheduler": {
+        "type": "cosine_decay_with_warmup",
+        "num_warmup_steps": 1000,
+        "num_decay_steps": 30000,
+        "peak_lr": 0.0001,
+        "decay_lr": 2.5e-06
+    },
     "eval": {
         "n_episodes": 50,
         "batch_size": 50,
@@ -182,12 +215,15 @@
     "wandb": {
         "enable": true,
         "disable_artifact": false,
-        "project": "lerobot-piper",
+        "project": "lerobot",
         "entity": null,
         "notes": null,
-        "run_id": "xi0lo9wm",
+        "run_id": "wbgf315f",
         "mode": null
     },
     "checkpoint_path": null,
-    "rename_map": {}
+    "rename_map": {
+        "observation.images.head": "observation.images.camera1",
+        "observation.images.hand": "observation.images.camera2"
+    }
 }
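The scheduler block replaces the ACT run's constant learning rate: 1,000 linear warmup steps up to peak_lr = 1e-4, then a cosine decay toward decay_lr = 2.5e-6 over 30,000 steps. Since steps = 20000, this run stops partway down the decay. A sketch of the curve those numbers describe (my reconstruction of cosine_decay_with_warmup, not LeRobot's exact code; in particular, whether the decay window starts at step 0 or after warmup is an assumption):

import math

def lr_at(step, num_warmup_steps=1000, num_decay_steps=30000,
          peak_lr=1e-4, decay_lr=2.5e-6):
    if step < num_warmup_steps:
        return peak_lr * step / num_warmup_steps  # linear warmup to peak_lr
    # cosine anneal from peak_lr down to decay_lr, assumed to start after warmup
    progress = min((step - num_warmup_steps) / num_decay_steps, 1.0)
    return decay_lr + 0.5 * (peak_lr - decay_lr) * (1 + math.cos(math.pi * progress))

print(lr_at(0), lr_at(1000), lr_at(20000))  # 0.0, 1e-4, ≈3.1e-5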