Robotics
LeRobot
Safetensors
diffusion
jjjeonghi committed on
Commit
1e3ff7b
·
verified ·
1 Parent(s): 01558e0

Upload policy weights, train config and readme

Browse files
Files changed (4) hide show
  1. README.md +5 -4
  2. config.json +39 -40
  3. model.safetensors +2 -2
  4. train_config.json +51 -50
README.md CHANGED
@@ -1,21 +1,22 @@
1
  ---
 
2
  datasets: data/merged_multitask
3
  library_name: lerobot
4
  license: apache-2.0
5
- model_name: diffusion
6
  pipeline_tag: robotics
7
  tags:
8
  - lerobot
9
  - robotics
10
- - diffusion
11
  ---
12
 
13
- # Model Card for diffusion
14
 
15
  <!-- Provide a quick summary of what the model is/does. -->
16
 
17
 
18
- [Diffusion Policy](https://huggingface.co/papers/2303.04137) treats visuomotor control as a generative diffusion process, producing smooth, multi-step action trajectories that excel at contact-rich manipulation.
19
 
20
 
21
  This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
 
1
  ---
2
+ base_model: lerobot/smolvla_base
3
  datasets: data/merged_multitask
4
  library_name: lerobot
5
  license: apache-2.0
6
+ model_name: smolvla
7
  pipeline_tag: robotics
8
  tags:
9
  - lerobot
10
  - robotics
11
+ - smolvla
12
  ---
13
 
14
+ # Model Card for smolvla
15
 
16
  <!-- Provide a quick summary of what the model is/does. -->
17
 
18
 
19
+ [SmolVLA](https://huggingface.co/papers/2506.01844) is a compact, efficient vision-language-action model that achieves competitive performance at reduced computational costs and can be deployed on consumer-grade hardware.
20
 
21
 
22
  This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "type": "diffusion",
3
- "n_obs_steps": 2,
4
  "input_features": {
5
  "observation.state": {
6
  "type": "STATE",
@@ -49,50 +49,49 @@
49
  "tags": null,
50
  "license": null,
51
  "pretrained_path": null,
52
- "horizon": 16,
53
- "n_action_steps": 8,
54
  "normalization_mapping": {
55
- "VISUAL": "MEAN_STD",
56
- "STATE": "MIN_MAX",
57
- "ACTION": "MIN_MAX"
58
  },
59
- "drop_n_last_frames": 7,
60
- "vision_backbone": "resnet18",
61
- "crop_shape": [
62
- 84,
63
- 84
64
- ],
65
- "crop_is_random": true,
66
- "pretrained_backbone_weights": null,
67
- "use_group_norm": true,
68
- "spatial_softmax_num_keypoints": 32,
69
- "use_separate_rgb_encoder_per_camera": false,
70
- "down_dims": [
71
  512,
72
- 1024,
73
- 2048
74
  ],
75
- "kernel_size": 5,
76
- "n_groups": 8,
77
- "diffusion_step_embed_dim": 128,
78
- "use_film_scale_modulation": true,
79
- "noise_scheduler_type": "DDPM",
80
- "num_train_timesteps": 100,
81
- "beta_schedule": "squaredcos_cap_v2",
82
- "beta_start": 0.0001,
83
- "beta_end": 0.02,
84
- "prediction_type": "epsilon",
85
- "clip_sample": true,
86
- "clip_sample_range": 1.0,
87
- "num_inference_steps": null,
88
- "do_mask_loss_for_padding": false,
89
  "optimizer_lr": 0.0001,
90
  "optimizer_betas": [
91
- 0.95,
92
- 0.999
93
  ],
94
  "optimizer_eps": 1e-08,
95
- "optimizer_weight_decay": 1e-06,
96
- "scheduler_name": "cosine",
97
- "scheduler_warmup_steps": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  }
 
1
  {
2
+ "type": "smolvla",
3
+ "n_obs_steps": 1,
4
  "input_features": {
5
  "observation.state": {
6
  "type": "STATE",
 
49
  "tags": null,
50
  "license": null,
51
  "pretrained_path": null,
52
+ "chunk_size": 50,
53
+ "n_action_steps": 50,
54
  "normalization_mapping": {
55
+ "VISUAL": "IDENTITY",
56
+ "STATE": "MEAN_STD",
57
+ "ACTION": "MEAN_STD"
58
  },
59
+ "max_state_dim": 32,
60
+ "max_action_dim": 32,
61
+ "resize_imgs_with_padding": [
 
 
 
 
 
 
 
 
 
62
  512,
63
+ 512
 
64
  ],
65
+ "empty_cameras": 0,
66
+ "adapt_to_pi_aloha": false,
67
+ "use_delta_joint_actions_aloha": false,
68
+ "tokenizer_max_length": 48,
69
+ "num_steps": 10,
70
+ "use_cache": true,
71
+ "freeze_vision_encoder": true,
72
+ "train_expert_only": true,
73
+ "train_state_proj": true,
 
 
 
 
 
74
  "optimizer_lr": 0.0001,
75
  "optimizer_betas": [
76
+ 0.9,
77
+ 0.95
78
  ],
79
  "optimizer_eps": 1e-08,
80
+ "optimizer_weight_decay": 1e-10,
81
+ "optimizer_grad_clip_norm": 10,
82
+ "scheduler_warmup_steps": 1000,
83
+ "scheduler_decay_steps": 30000,
84
+ "scheduler_decay_lr": 2.5e-06,
85
+ "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
86
+ "load_vlm_weights": false,
87
+ "add_image_special_tokens": false,
88
+ "attention_mode": "cross_attn",
89
+ "prefix_length": -1,
90
+ "pad_language_to": "longest",
91
+ "num_expert_layers": -1,
92
+ "num_vlm_layers": 16,
93
+ "self_attn_every_n_layers": 2,
94
+ "expert_width_multiplier": 0.75,
95
+ "min_period": 0.004,
96
+ "max_period": 4.0
97
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0104bcfa0d8809416cd4016f0653b497e5d9752cd336bfd664550879ed17e6cf
3
- size 1082658752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8378d98d2f003917a142cf2bc75628f6a3adb04be96ef1822f149bd5436282e9
3
+ size 1197789224
train_config.json CHANGED
@@ -81,8 +81,8 @@
81
  },
82
  "env": null,
83
  "policy": {
84
- "type": "diffusion",
85
- "n_obs_steps": 2,
86
  "input_features": {
87
  "observation.state": {
88
  "type": "STATE",
@@ -131,55 +131,54 @@
131
  "tags": null,
132
  "license": null,
133
  "pretrained_path": null,
134
- "horizon": 16,
135
- "n_action_steps": 8,
136
  "normalization_mapping": {
137
- "VISUAL": "MEAN_STD",
138
- "STATE": "MIN_MAX",
139
- "ACTION": "MIN_MAX"
140
  },
141
- "drop_n_last_frames": 7,
142
- "vision_backbone": "resnet18",
143
- "crop_shape": [
144
- 84,
145
- 84
146
- ],
147
- "crop_is_random": true,
148
- "pretrained_backbone_weights": null,
149
- "use_group_norm": true,
150
- "spatial_softmax_num_keypoints": 32,
151
- "use_separate_rgb_encoder_per_camera": false,
152
- "down_dims": [
153
  512,
154
- 1024,
155
- 2048
156
  ],
157
- "kernel_size": 5,
158
- "n_groups": 8,
159
- "diffusion_step_embed_dim": 128,
160
- "use_film_scale_modulation": true,
161
- "noise_scheduler_type": "DDPM",
162
- "num_train_timesteps": 100,
163
- "beta_schedule": "squaredcos_cap_v2",
164
- "beta_start": 0.0001,
165
- "beta_end": 0.02,
166
- "prediction_type": "epsilon",
167
- "clip_sample": true,
168
- "clip_sample_range": 1.0,
169
- "num_inference_steps": null,
170
- "do_mask_loss_for_padding": false,
171
  "optimizer_lr": 0.0001,
172
  "optimizer_betas": [
173
- 0.95,
174
- 0.999
175
  ],
176
  "optimizer_eps": 1e-08,
177
- "optimizer_weight_decay": 1e-06,
178
- "scheduler_name": "cosine",
179
- "scheduler_warmup_steps": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  },
181
- "output_dir": "/media/choi/HDD/outputs/train/Diffusion_merged",
182
- "job_name": "diffusion_merged",
183
  "resume": false,
184
  "seed": 1000,
185
  "num_workers": 4,
@@ -191,20 +190,22 @@
191
  "save_freq": 20000,
192
  "use_policy_training_preset": true,
193
  "optimizer": {
194
- "type": "adam",
195
  "lr": 0.0001,
196
- "weight_decay": 1e-06,
197
- "grad_clip_norm": 10.0,
198
  "betas": [
199
- 0.95,
200
- 0.999
201
  ],
202
  "eps": 1e-08
203
  },
204
  "scheduler": {
205
- "type": "diffuser",
206
- "num_warmup_steps": 500,
207
- "name": "cosine"
 
 
208
  },
209
  "eval": {
210
  "n_episodes": 50,
 
81
  },
82
  "env": null,
83
  "policy": {
84
+ "type": "smolvla",
85
+ "n_obs_steps": 1,
86
  "input_features": {
87
  "observation.state": {
88
  "type": "STATE",
 
131
  "tags": null,
132
  "license": null,
133
  "pretrained_path": null,
134
+ "chunk_size": 50,
135
+ "n_action_steps": 50,
136
  "normalization_mapping": {
137
+ "VISUAL": "IDENTITY",
138
+ "STATE": "MEAN_STD",
139
+ "ACTION": "MEAN_STD"
140
  },
141
+ "max_state_dim": 32,
142
+ "max_action_dim": 32,
143
+ "resize_imgs_with_padding": [
 
 
 
 
 
 
 
 
 
144
  512,
145
+ 512
 
146
  ],
147
+ "empty_cameras": 0,
148
+ "adapt_to_pi_aloha": false,
149
+ "use_delta_joint_actions_aloha": false,
150
+ "tokenizer_max_length": 48,
151
+ "num_steps": 10,
152
+ "use_cache": true,
153
+ "freeze_vision_encoder": true,
154
+ "train_expert_only": true,
155
+ "train_state_proj": true,
 
 
 
 
 
156
  "optimizer_lr": 0.0001,
157
  "optimizer_betas": [
158
+ 0.9,
159
+ 0.95
160
  ],
161
  "optimizer_eps": 1e-08,
162
+ "optimizer_weight_decay": 1e-10,
163
+ "optimizer_grad_clip_norm": 10,
164
+ "scheduler_warmup_steps": 1000,
165
+ "scheduler_decay_steps": 30000,
166
+ "scheduler_decay_lr": 2.5e-06,
167
+ "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
168
+ "load_vlm_weights": false,
169
+ "add_image_special_tokens": false,
170
+ "attention_mode": "cross_attn",
171
+ "prefix_length": -1,
172
+ "pad_language_to": "longest",
173
+ "num_expert_layers": -1,
174
+ "num_vlm_layers": 16,
175
+ "self_attn_every_n_layers": 2,
176
+ "expert_width_multiplier": 0.75,
177
+ "min_period": 0.004,
178
+ "max_period": 4.0
179
  },
180
+ "output_dir": "/media/choi/HDD/outputs/train/smolvla_merged",
181
+ "job_name": "smolvla_merged",
182
  "resume": false,
183
  "seed": 1000,
184
  "num_workers": 4,
 
190
  "save_freq": 20000,
191
  "use_policy_training_preset": true,
192
  "optimizer": {
193
+ "type": "adamw",
194
  "lr": 0.0001,
195
+ "weight_decay": 1e-10,
196
+ "grad_clip_norm": 10,
197
  "betas": [
198
+ 0.9,
199
+ 0.95
200
  ],
201
  "eps": 1e-08
202
  },
203
  "scheduler": {
204
+ "type": "cosine_decay_with_warmup",
205
+ "num_warmup_steps": 1000,
206
+ "num_decay_steps": 30000,
207
+ "peak_lr": 0.0001,
208
+ "decay_lr": 2.5e-06
209
  },
210
  "eval": {
211
  "n_episodes": 50,