Robotics · LeRobot · Safetensors · smolvla

Commit 15d7600 (verified) · committed by kobikelemen · 1 parent: 54ae9fc

Upload policy weights, train config and readme

Files changed (4):
  1. README.md +63 -0
  2. config.json +12 -36
  3. model.safetensors +2 -2
  4. train_config.json +16 -40
README.md ADDED
@@ -0,0 +1,63 @@
---
base_model: lerobot/smolvla_base
datasets: kobikelemen/towel_fold_trimmed
library_name: lerobot
license: apache-2.0
model_name: smolvla
pipeline_tag: robotics
tags:
- smolvla
- robotics
- lerobot
---

# Model Card for smolvla

<!-- Provide a quick summary of what the model is/does. -->

[SmolVLA](https://huggingface.co/papers/2506.01844) is a compact, efficient vision-language-action model that achieves competitive performance at reduced computational costs and can be deployed on consumer-grade hardware.

This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
See the full documentation at [LeRobot Docs](https://huggingface.co/docs/lerobot/index).

---

## How to Get Started with the Model

For a complete walkthrough, see the [training guide](https://huggingface.co/docs/lerobot/il_robots#train-a-policy).
Below is the short version of how to train and run inference/eval:

### Train from scratch

```bash
lerobot-train \
  --dataset.repo_id=${HF_USER}/<dataset> \
  --policy.type=act \
  --output_dir=outputs/train/<desired_policy_repo_id> \
  --job_name=lerobot_training \
  --policy.device=cuda \
  --policy.repo_id=${HF_USER}/<desired_policy_repo_id> \
  --wandb.enable=true
```

_Writes checkpoints to `outputs/train/<desired_policy_repo_id>/checkpoints/`._

### Evaluate the policy/run inference

```bash
lerobot-record \
  --robot.type=so100_follower \
  --dataset.repo_id=<hf_user>/eval_<dataset> \
  --policy.path=<hf_user>/<desired_policy_repo_id> \
  --episodes=10
```

Prefix the dataset repo with **eval\_** and point `--policy.path` to a local or Hub checkpoint.

---

## Model Details

- **License:** apache-2.0
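As a complement to the CLI instructions in the card above, the uploaded checkpoint can also be loaded directly in Python. The snippet below is a minimal sketch, not the official LeRobot recipe: it assumes a recent lerobot release where `SmolVLAPolicy` is importable from `lerobot.policies.smolvla.modeling_smolvla` and exposes `from_pretrained`/`select_action`; the repo id, task string, and dummy observation shapes (a 6-dim state and two 3x720x1280 cameras, per `config.json` in this commit) are placeholders to replace for a real setup.

```python
# Illustrative inference sketch, not the official LeRobot recipe.
# Assumes a recent lerobot release; adjust the import path to your installed version.
import torch
from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy

# Placeholder repo id, as in the CLI examples above.
policy = SmolVLAPolicy.from_pretrained("<hf_user>/<desired_policy_repo_id>")
policy.eval()

# Dummy batch matching the features declared in config.json:
# a 6-dim state, two 3x720x1280 cameras, and a language instruction.
batch = {
    "observation.state": torch.zeros(1, 6),
    "observation.images.overhead": torch.zeros(1, 3, 720, 1280),
    "observation.images.wrist": torch.zeros(1, 3, 720, 1280),
    "task": ["fold the towel"],  # placeholder instruction
}
with torch.no_grad():
    action = policy.select_action(batch)  # one action step from the predicted chunk
print(action.shape)
```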
config.json CHANGED
@@ -8,44 +8,20 @@
         6
       ]
     },
-    "observation.images.camera1": {
-      "type": "VISUAL",
-      "shape": [
-        3,
-        256,
-        256
-      ]
-    },
-    "observation.images.camera2": {
-      "type": "VISUAL",
-      "shape": [
-        3,
-        256,
-        256
-      ]
-    },
-    "observation.images.camera3": {
-      "type": "VISUAL",
-      "shape": [
-        3,
-        256,
-        256
-      ]
-    },
     "observation.images.overhead": {
       "type": "VISUAL",
       "shape": [
         3,
-        256,
-        256
+        720,
+        1280
       ]
     },
     "observation.images.wrist": {
       "type": "VISUAL",
       "shape": [
         3,
-        256,
-        256
+        720,
+        1280
       ]
     }
   },
@@ -64,9 +40,9 @@
   "private": null,
   "tags": null,
   "license": null,
-  "pretrained_path": "lerobot/smolvla_base",
-  "chunk_size": 50,
-  "n_action_steps": 50,
+  "pretrained_path": null,
+  "chunk_size": 20,
+  "n_action_steps": 20,
   "normalization_mapping": {
     "VISUAL": "IDENTITY",
     "STATE": "MEAN_STD",
@@ -94,17 +70,17 @@
   ],
   "optimizer_eps": 1e-08,
   "optimizer_weight_decay": 1e-10,
-  "optimizer_grad_clip_norm": 10.0,
+  "optimizer_grad_clip_norm": 10,
   "scheduler_warmup_steps": 1000,
   "scheduler_decay_steps": 30000,
   "scheduler_decay_lr": 2.5e-06,
   "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
-  "load_vlm_weights": true,
+  "load_vlm_weights": false,
   "add_image_special_tokens": false,
   "attention_mode": "cross_attn",
-  "prefix_length": 0,
-  "pad_language_to": "max_length",
-  "num_expert_layers": 0,
+  "prefix_length": -1,
+  "pad_language_to": "longest",
+  "num_expert_layers": -1,
   "num_vlm_layers": 16,
   "self_attn_every_n_layers": 2,
   "expert_width_multiplier": 0.75,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f2f468e47143294e1793198f74fc41648f5a8196e7da5f0a948c6dd9af4fb89c
-size 906712520
+oid sha256:bad94f2fccee6bb0032045222af18fa89ba7ad2a27fc1f9efafbf4823992708c
+size 1197789224
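The updated LFS pointer shows the checkpoint growing from about 907 MB to roughly 1.2 GB. To sanity-check a downloaded copy locally, here is a small hedged sketch using the `safetensors` library; the local file path is an assumption.

```python
# Sketch: inspect the downloaded checkpoint with the safetensors library.
from safetensors import safe_open

path = "model.safetensors"  # assumed local path to the downloaded file
with safe_open(path, framework="pt") as f:
    keys = list(f.keys())
    n_params = sum(f.get_tensor(k).numel() for k in keys)
print(f"{len(keys)} tensors, {n_params:,} parameters")
```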
train_config.json CHANGED
@@ -76,7 +76,7 @@
   },
   "revision": null,
   "use_imagenet_stats": true,
-  "video_backend": "pyav",
+  "video_backend": "torchcodec",
   "streaming": false
 },
 "env": null,
@@ -90,44 +90,20 @@
         6
       ]
     },
-    "observation.images.camera1": {
-      "type": "VISUAL",
-      "shape": [
-        3,
-        256,
-        256
-      ]
-    },
-    "observation.images.camera2": {
-      "type": "VISUAL",
-      "shape": [
-        3,
-        256,
-        256
-      ]
-    },
-    "observation.images.camera3": {
-      "type": "VISUAL",
-      "shape": [
-        3,
-        256,
-        256
-      ]
-    },
     "observation.images.overhead": {
       "type": "VISUAL",
       "shape": [
         3,
-        256,
-        256
+        720,
+        1280
      ]
     },
     "observation.images.wrist": {
       "type": "VISUAL",
       "shape": [
         3,
-        256,
-        256
+        720,
+        1280
       ]
     }
   },
@@ -146,9 +122,9 @@
   "private": null,
   "tags": null,
   "license": null,
-  "pretrained_path": "lerobot/smolvla_base",
-  "chunk_size": 50,
-  "n_action_steps": 50,
+  "pretrained_path": null,
+  "chunk_size": 20,
+  "n_action_steps": 20,
   "normalization_mapping": {
     "VISUAL": "IDENTITY",
     "STATE": "MEAN_STD",
@@ -176,24 +152,24 @@
   ],
   "optimizer_eps": 1e-08,
   "optimizer_weight_decay": 1e-10,
-  "optimizer_grad_clip_norm": 10.0,
+  "optimizer_grad_clip_norm": 10,
   "scheduler_warmup_steps": 1000,
   "scheduler_decay_steps": 30000,
   "scheduler_decay_lr": 2.5e-06,
   "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
-  "load_vlm_weights": true,
+  "load_vlm_weights": false,
   "add_image_special_tokens": false,
   "attention_mode": "cross_attn",
-  "prefix_length": 0,
-  "pad_language_to": "max_length",
-  "num_expert_layers": 0,
+  "prefix_length": -1,
+  "pad_language_to": "longest",
+  "num_expert_layers": -1,
   "num_vlm_layers": 16,
   "self_attn_every_n_layers": 2,
   "expert_width_multiplier": 0.75,
   "min_period": 0.004,
   "max_period": 4.0
 },
-"output_dir": "outputs/train/2025-10-20/05-54-55_smolvla",
+"output_dir": "outputs/train/2025-10-21/05-34-13_smolvla",
 "job_name": "smolvla",
 "resume": false,
 "seed": 1000,
@@ -209,7 +185,7 @@
   "type": "adamw",
   "lr": 0.0001,
   "weight_decay": 1e-10,
-  "grad_clip_norm": 10.0,
+  "grad_clip_norm": 10,
   "betas": [
     0.9,
     0.95
@@ -234,7 +210,7 @@
   "project": "lerobot",
   "entity": null,
   "notes": null,
-  "run_id": "ji942rc2",
+  "run_id": "m4h1qq5w",
   "mode": null
 }
 }
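To confirm which training settings this commit changed (video backend, action-chunk length, VLM weight loading), the updated `train_config.json` can be fetched from the Hub and inspected. A minimal sketch, assuming the placeholder repo id is filled in and that the file keeps the `dataset`/`policy` nesting visible in the diff above:

```python
# Sketch: download and inspect the updated train_config.json.
# The repo id is a placeholder; hf_hub_download is from huggingface_hub.
import json
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="<hf_user>/<desired_policy_repo_id>",
    filename="train_config.json",
)
with open(path) as f:
    cfg = json.load(f)

print("video_backend:", cfg["dataset"]["video_backend"])       # "torchcodec" after this commit
print("chunk_size:", cfg["policy"]["chunk_size"])               # 20
print("n_action_steps:", cfg["policy"]["n_action_steps"])       # 20
print("load_vlm_weights:", cfg["policy"]["load_vlm_weights"])   # False
```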