Robotics
LeRobot
Safetensors
smolvla
duys0304 committed on
Commit
cf68dfb
·
verified ·
1 Parent(s): 9d0ac02

Upload policy weights, train config and readme

Browse files
Files changed (4) hide show
  1. README.md +6 -5
  2. config.json +54 -47
  3. model.safetensors +2 -2
  4. train_config.json +82 -75
README.md CHANGED
@@ -1,21 +1,22 @@
1
  ---
2
- datasets: lerobot/pusht
 
3
  library_name: lerobot
4
  license: apache-2.0
5
- model_name: diffusion
6
  pipeline_tag: robotics
7
  tags:
8
  - lerobot
 
9
  - robotics
10
- - diffusion
11
  ---
12
 
13
- # Model Card for diffusion
14
 
15
  <!-- Provide a quick summary of what the model is/does. -->
16
 
17
 
18
- [Diffusion Policy](https://huggingface.co/papers/2303.04137) treats visuomotor control as a generative diffusion process, producing smooth, multi-step action trajectories that excel at contact-rich manipulation.
19
 
20
 
21
  This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
 
1
  ---
2
+ base_model: lerobot/smolvla_base
3
+ datasets: lerobot/svla_so101_pickplace
4
  library_name: lerobot
5
  license: apache-2.0
6
+ model_name: smolvla
7
  pipeline_tag: robotics
8
  tags:
9
  - lerobot
10
+ - smolvla
11
  - robotics
 
12
  ---
13
 
14
+ # Model Card for smolvla
15
 
16
  <!-- Provide a quick summary of what the model is/does. -->
17
 
18
 
19
+ [SmolVLA](https://huggingface.co/papers/2506.01844) is a compact, efficient vision-language-action model that achieves competitive performance at reduced computational costs and can be deployed on consumer-grade hardware.
20
 
21
 
22
  This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
config.json CHANGED
@@ -1,24 +1,32 @@
1
  {
2
- "type": "diffusion",
3
- "n_obs_steps": 2,
4
  "normalization_mapping": {
5
- "VISUAL": "MEAN_STD",
6
- "STATE": "MIN_MAX",
7
- "ACTION": "MIN_MAX"
8
  },
9
  "input_features": {
10
- "observation.image": {
 
 
 
 
 
 
11
  "type": "VISUAL",
12
  "shape": [
13
  3,
14
- 96,
15
- 96
16
  ]
17
  },
18
- "observation.state": {
19
- "type": "STATE",
20
  "shape": [
21
- 2
 
 
22
  ]
23
  }
24
  },
@@ -26,7 +34,7 @@
26
  "action": {
27
  "type": "ACTION",
28
  "shape": [
29
- 2
30
  ]
31
  }
32
  },
@@ -37,45 +45,44 @@
37
  "private": null,
38
  "tags": null,
39
  "license": null,
40
- "horizon": 16,
41
- "n_action_steps": 8,
42
- "drop_n_last_frames": 7,
43
- "vision_backbone": "resnet18",
44
- "crop_shape": [
45
- 84,
46
- 84
47
- ],
48
- "crop_is_random": true,
49
- "pretrained_backbone_weights": null,
50
- "use_group_norm": true,
51
- "spatial_softmax_num_keypoints": 32,
52
- "use_separate_rgb_encoder_per_camera": false,
53
- "down_dims": [
54
  512,
55
- 1024,
56
- 2048
57
  ],
58
- "kernel_size": 5,
59
- "n_groups": 8,
60
- "diffusion_step_embed_dim": 128,
61
- "use_film_scale_modulation": true,
62
- "noise_scheduler_type": "DDPM",
63
- "num_train_timesteps": 100,
64
- "beta_schedule": "squaredcos_cap_v2",
65
- "beta_start": 0.0001,
66
- "beta_end": 0.02,
67
- "prediction_type": "epsilon",
68
- "clip_sample": true,
69
- "clip_sample_range": 1.0,
70
- "num_inference_steps": null,
71
- "do_mask_loss_for_padding": false,
72
  "optimizer_lr": 0.0001,
73
  "optimizer_betas": [
74
- 0.95,
75
- 0.999
76
  ],
77
  "optimizer_eps": 1e-08,
78
- "optimizer_weight_decay": 1e-06,
79
- "scheduler_name": "cosine",
80
- "scheduler_warmup_steps": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  }
 
1
  {
2
+ "type": "smolvla",
3
+ "n_obs_steps": 1,
4
  "normalization_mapping": {
5
+ "VISUAL": "IDENTITY",
6
+ "STATE": "MEAN_STD",
7
+ "ACTION": "MEAN_STD"
8
  },
9
  "input_features": {
10
+ "observation.state": {
11
+ "type": "STATE",
12
+ "shape": [
13
+ 6
14
+ ]
15
+ },
16
+ "observation.images.up": {
17
  "type": "VISUAL",
18
  "shape": [
19
  3,
20
+ 480,
21
+ 640
22
  ]
23
  },
24
+ "observation.images.side": {
25
+ "type": "VISUAL",
26
  "shape": [
27
+ 3,
28
+ 480,
29
+ 640
30
  ]
31
  }
32
  },
 
34
  "action": {
35
  "type": "ACTION",
36
  "shape": [
37
+ 6
38
  ]
39
  }
40
  },
 
45
  "private": null,
46
  "tags": null,
47
  "license": null,
48
+ "chunk_size": 50,
49
+ "n_action_steps": 50,
50
+ "max_state_dim": 32,
51
+ "max_action_dim": 32,
52
+ "resize_imgs_with_padding": [
 
 
 
 
 
 
 
 
 
53
  512,
54
+ 512
 
55
  ],
56
+ "empty_cameras": 0,
57
+ "adapt_to_pi_aloha": false,
58
+ "use_delta_joint_actions_aloha": false,
59
+ "tokenizer_max_length": 48,
60
+ "num_steps": 10,
61
+ "use_cache": true,
62
+ "freeze_vision_encoder": true,
63
+ "train_expert_only": true,
64
+ "train_state_proj": true,
 
 
 
 
 
65
  "optimizer_lr": 0.0001,
66
  "optimizer_betas": [
67
+ 0.9,
68
+ 0.95
69
  ],
70
  "optimizer_eps": 1e-08,
71
+ "optimizer_weight_decay": 1e-10,
72
+ "optimizer_grad_clip_norm": 10.0,
73
+ "scheduler_warmup_steps": 1000,
74
+ "scheduler_decay_steps": 30000,
75
+ "scheduler_decay_lr": 2.5e-06,
76
+ "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
77
+ "load_vlm_weights": true,
78
+ "add_image_special_tokens": false,
79
+ "attention_mode": "cross_attn",
80
+ "prefix_length": 0,
81
+ "pad_language_to": "max_length",
82
+ "num_expert_layers": 0,
83
+ "num_vlm_layers": 16,
84
+ "self_attn_every_n_layers": 2,
85
+ "expert_width_multiplier": 0.75,
86
+ "min_period": 0.004,
87
+ "max_period": 4.0
88
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:05262f4418ffa05d83a7c56223f692af78d371209e569ec083fb93309fe18f95
3
- size 1050862408
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19e009e324f69dac2013f6bfd7bafeef2975370e2d446c2119fe175cac3ba0c9
3
+ size 906713296
train_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "dataset": {
3
- "repo_id": "lerobot/pusht",
4
  "root": null,
5
  "episodes": null,
6
  "image_transforms": {
@@ -65,27 +65,27 @@
65
  "video_backend": "torchcodec"
66
  },
67
  "env": {
68
- "type": "pusht",
69
- "task": "PushT-v0",
70
- "fps": 10,
71
  "features": {
72
  "action": {
73
  "type": "ACTION",
74
  "shape": [
75
- 2
76
  ]
77
  },
78
  "agent_pos": {
79
  "type": "STATE",
80
  "shape": [
81
- 2
82
  ]
83
  },
84
- "pixels": {
85
  "type": "VISUAL",
86
  "shape": [
87
- 384,
88
- 384,
89
  3
90
  ]
91
  }
@@ -93,36 +93,42 @@
93
  "features_map": {
94
  "action": "action",
95
  "agent_pos": "observation.state",
96
- "environment_state": "observation.environment_state",
97
- "pixels": "observation.image"
98
  },
99
- "episode_length": 300,
100
  "obs_type": "pixels_agent_pos",
101
- "render_mode": "rgb_array",
102
- "visualization_width": 384,
103
- "visualization_height": 384
104
  },
105
  "policy": {
106
- "type": "diffusion",
107
- "n_obs_steps": 2,
108
  "normalization_mapping": {
109
- "VISUAL": "MEAN_STD",
110
- "STATE": "MIN_MAX",
111
- "ACTION": "MIN_MAX"
112
  },
113
  "input_features": {
114
- "observation.image": {
 
 
 
 
 
 
115
  "type": "VISUAL",
116
  "shape": [
117
  3,
118
- 96,
119
- 96
120
  ]
121
  },
122
- "observation.state": {
123
- "type": "STATE",
124
  "shape": [
125
- 2
 
 
126
  ]
127
  }
128
  },
@@ -130,7 +136,7 @@
130
  "action": {
131
  "type": "ACTION",
132
  "shape": [
133
- 2
134
  ]
135
  }
136
  },
@@ -141,75 +147,76 @@
141
  "private": null,
142
  "tags": null,
143
  "license": null,
144
- "horizon": 16,
145
- "n_action_steps": 8,
146
- "drop_n_last_frames": 7,
147
- "vision_backbone": "resnet18",
148
- "crop_shape": [
149
- 84,
150
- 84
151
- ],
152
- "crop_is_random": true,
153
- "pretrained_backbone_weights": null,
154
- "use_group_norm": true,
155
- "spatial_softmax_num_keypoints": 32,
156
- "use_separate_rgb_encoder_per_camera": false,
157
- "down_dims": [
158
  512,
159
- 1024,
160
- 2048
161
  ],
162
- "kernel_size": 5,
163
- "n_groups": 8,
164
- "diffusion_step_embed_dim": 128,
165
- "use_film_scale_modulation": true,
166
- "noise_scheduler_type": "DDPM",
167
- "num_train_timesteps": 100,
168
- "beta_schedule": "squaredcos_cap_v2",
169
- "beta_start": 0.0001,
170
- "beta_end": 0.02,
171
- "prediction_type": "epsilon",
172
- "clip_sample": true,
173
- "clip_sample_range": 1.0,
174
- "num_inference_steps": null,
175
- "do_mask_loss_for_padding": false,
176
  "optimizer_lr": 0.0001,
177
  "optimizer_betas": [
178
- 0.95,
179
- 0.999
180
  ],
181
  "optimizer_eps": 1e-08,
182
- "optimizer_weight_decay": 1e-06,
183
- "scheduler_name": "cosine",
184
- "scheduler_warmup_steps": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  },
186
- "output_dir": "outputs/train/2025-07-10/12-44-21_pusht_diffusion",
187
- "job_name": "pusht_diffusion",
188
  "resume": false,
189
  "seed": 1000,
190
  "num_workers": 4,
191
- "batch_size": 8,
192
- "steps": 100000,
193
  "eval_freq": 20000,
194
  "log_freq": 200,
195
  "save_checkpoint": true,
196
  "save_freq": 20000,
197
  "use_policy_training_preset": true,
198
  "optimizer": {
199
- "type": "adam",
200
  "lr": 0.0001,
201
- "weight_decay": 1e-06,
202
  "grad_clip_norm": 10.0,
203
  "betas": [
204
- 0.95,
205
- 0.999
206
  ],
207
  "eps": 1e-08
208
  },
209
  "scheduler": {
210
- "type": "diffuser",
211
- "num_warmup_steps": 500,
212
- "name": "cosine"
 
 
213
  },
214
  "eval": {
215
  "n_episodes": 50,
@@ -217,12 +224,12 @@
217
  "use_async_envs": false
218
  },
219
  "wandb": {
220
- "enable": false,
221
  "disable_artifact": false,
222
  "project": "lerobot",
223
  "entity": null,
224
  "notes": null,
225
- "run_id": null,
226
  "mode": null
227
  }
228
  }
 
1
  {
2
  "dataset": {
3
+ "repo_id": "lerobot/svla_so101_pickplace",
4
  "root": null,
5
  "episodes": null,
6
  "image_transforms": {
 
65
  "video_backend": "torchcodec"
66
  },
67
  "env": {
68
+ "type": "aloha",
69
+ "task": "AlohaInsertion-v0",
70
+ "fps": 50,
71
  "features": {
72
  "action": {
73
  "type": "ACTION",
74
  "shape": [
75
+ 14
76
  ]
77
  },
78
  "agent_pos": {
79
  "type": "STATE",
80
  "shape": [
81
+ 14
82
  ]
83
  },
84
+ "pixels/top": {
85
  "type": "VISUAL",
86
  "shape": [
87
+ 480,
88
+ 640,
89
  3
90
  ]
91
  }
 
93
  "features_map": {
94
  "action": "action",
95
  "agent_pos": "observation.state",
96
+ "top": "observation.image.top",
97
+ "pixels/top": "observation.images.top"
98
  },
99
+ "episode_length": 400,
100
  "obs_type": "pixels_agent_pos",
101
+ "render_mode": "rgb_array"
 
 
102
  },
103
  "policy": {
104
+ "type": "smolvla",
105
+ "n_obs_steps": 1,
106
  "normalization_mapping": {
107
+ "VISUAL": "IDENTITY",
108
+ "STATE": "MEAN_STD",
109
+ "ACTION": "MEAN_STD"
110
  },
111
  "input_features": {
112
+ "observation.state": {
113
+ "type": "STATE",
114
+ "shape": [
115
+ 6
116
+ ]
117
+ },
118
+ "observation.images.up": {
119
  "type": "VISUAL",
120
  "shape": [
121
  3,
122
+ 480,
123
+ 640
124
  ]
125
  },
126
+ "observation.images.side": {
127
+ "type": "VISUAL",
128
  "shape": [
129
+ 3,
130
+ 480,
131
+ 640
132
  ]
133
  }
134
  },
 
136
  "action": {
137
  "type": "ACTION",
138
  "shape": [
139
+ 6
140
  ]
141
  }
142
  },
 
147
  "private": null,
148
  "tags": null,
149
  "license": null,
150
+ "chunk_size": 50,
151
+ "n_action_steps": 50,
152
+ "max_state_dim": 32,
153
+ "max_action_dim": 32,
154
+ "resize_imgs_with_padding": [
 
 
 
 
 
 
 
 
 
155
  512,
156
+ 512
 
157
  ],
158
+ "empty_cameras": 0,
159
+ "adapt_to_pi_aloha": false,
160
+ "use_delta_joint_actions_aloha": false,
161
+ "tokenizer_max_length": 48,
162
+ "num_steps": 10,
163
+ "use_cache": true,
164
+ "freeze_vision_encoder": true,
165
+ "train_expert_only": true,
166
+ "train_state_proj": true,
 
 
 
 
 
167
  "optimizer_lr": 0.0001,
168
  "optimizer_betas": [
169
+ 0.9,
170
+ 0.95
171
  ],
172
  "optimizer_eps": 1e-08,
173
+ "optimizer_weight_decay": 1e-10,
174
+ "optimizer_grad_clip_norm": 10.0,
175
+ "scheduler_warmup_steps": 1000,
176
+ "scheduler_decay_steps": 30000,
177
+ "scheduler_decay_lr": 2.5e-06,
178
+ "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
179
+ "load_vlm_weights": true,
180
+ "add_image_special_tokens": false,
181
+ "attention_mode": "cross_attn",
182
+ "prefix_length": 0,
183
+ "pad_language_to": "max_length",
184
+ "num_expert_layers": 0,
185
+ "num_vlm_layers": 16,
186
+ "self_attn_every_n_layers": 2,
187
+ "expert_width_multiplier": 0.75,
188
+ "min_period": 0.004,
189
+ "max_period": 4.0
190
  },
191
+ "output_dir": "outputs/train/so101_pickplace",
192
+ "job_name": "my_smolvla_training",
193
  "resume": false,
194
  "seed": 1000,
195
  "num_workers": 4,
196
+ "batch_size": 32,
197
+ "steps": 200,
198
  "eval_freq": 20000,
199
  "log_freq": 200,
200
  "save_checkpoint": true,
201
  "save_freq": 20000,
202
  "use_policy_training_preset": true,
203
  "optimizer": {
204
+ "type": "adamw",
205
  "lr": 0.0001,
206
+ "weight_decay": 1e-10,
207
  "grad_clip_norm": 10.0,
208
  "betas": [
209
+ 0.9,
210
+ 0.95
211
  ],
212
  "eps": 1e-08
213
  },
214
  "scheduler": {
215
+ "type": "cosine_decay_with_warmup",
216
+ "num_warmup_steps": 1000,
217
+ "num_decay_steps": 30000,
218
+ "peak_lr": 0.0001,
219
+ "decay_lr": 2.5e-06
220
  },
221
  "eval": {
222
  "n_episodes": 50,
 
224
  "use_async_envs": false
225
  },
226
  "wandb": {
227
+ "enable": true,
228
  "disable_artifact": false,
229
  "project": "lerobot",
230
  "entity": null,
231
  "notes": null,
232
+ "run_id": "m3dkvyt8",
233
  "mode": null
234
  }
235
  }