Robotics
LeRobot
Safetensors
act
jjjeonghi committed on
Commit
7bc7b65
·
verified ·
1 Parent(s): 30ff83f

Upload policy weights, train config and readme

Browse files
Files changed (4) hide show
  1. README.md +5 -6
  2. config.json +41 -39
  3. model.safetensors +2 -2
  4. train_config.json +61 -54
README.md CHANGED
@@ -1,22 +1,21 @@
1
  ---
2
- base_model: lerobot/smolvla_base
3
- datasets: data/merged_multitask
4
  library_name: lerobot
5
  license: apache-2.0
6
- model_name: smolvla
7
  pipeline_tag: robotics
8
  tags:
9
  - lerobot
10
  - robotics
11
- - smolvla
12
  ---
13
 
14
- # Model Card for smolvla
15
 
16
  <!-- Provide a quick summary of what the model is/does. -->
17
 
18
 
19
- [SmolVLA](https://huggingface.co/papers/2506.01844) is a compact, efficient vision-language-action model that achieves competitive performance at reduced computational costs and can be deployed on consumer-grade hardware.
20
 
21
 
22
  This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
 
1
  ---
2
+ datasets: data/banana_plate
 
3
  library_name: lerobot
4
  license: apache-2.0
5
+ model_name: diffusion
6
  pipeline_tag: robotics
7
  tags:
8
  - lerobot
9
  - robotics
10
+ - diffusion
11
  ---
12
 
13
+ # Model Card for diffusion
14
 
15
  <!-- Provide a quick summary of what the model is/does. -->
16
 
17
 
18
+ [Diffusion Policy](https://huggingface.co/papers/2303.04137) treats visuomotor control as a generative diffusion process, producing smooth, multi-step action trajectories that excel at contact-rich manipulation.
19
 
20
 
21
  This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "type": "smolvla",
3
- "n_obs_steps": 1,
4
  "input_features": {
5
  "observation.state": {
6
  "type": "STATE",
@@ -43,55 +43,57 @@
43
  },
44
  "device": "cuda",
45
  "use_amp": false,
 
46
  "push_to_hub": true,
47
  "repo_id": "False",
48
  "private": null,
49
  "tags": null,
50
  "license": null,
51
  "pretrained_path": null,
52
- "chunk_size": 50,
53
- "n_action_steps": 50,
54
  "normalization_mapping": {
55
- "VISUAL": "IDENTITY",
56
- "STATE": "MEAN_STD",
57
- "ACTION": "MEAN_STD"
58
  },
59
- "max_state_dim": 32,
60
- "max_action_dim": 32,
61
- "resize_imgs_with_padding": [
 
 
 
 
 
 
 
 
 
62
  512,
63
- 512
 
64
  ],
65
- "empty_cameras": 0,
66
- "adapt_to_pi_aloha": false,
67
- "use_delta_joint_actions_aloha": false,
68
- "tokenizer_max_length": 48,
69
- "num_steps": 10,
70
- "use_cache": true,
71
- "freeze_vision_encoder": true,
72
- "train_expert_only": true,
73
- "train_state_proj": true,
 
 
 
 
 
74
  "optimizer_lr": 0.0001,
75
  "optimizer_betas": [
76
- 0.9,
77
- 0.95
78
  ],
79
  "optimizer_eps": 1e-08,
80
- "optimizer_weight_decay": 1e-10,
81
- "optimizer_grad_clip_norm": 10,
82
- "scheduler_warmup_steps": 1000,
83
- "scheduler_decay_steps": 30000,
84
- "scheduler_decay_lr": 2.5e-06,
85
- "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
86
- "load_vlm_weights": false,
87
- "add_image_special_tokens": false,
88
- "attention_mode": "cross_attn",
89
- "prefix_length": -1,
90
- "pad_language_to": "longest",
91
- "num_expert_layers": -1,
92
- "num_vlm_layers": 16,
93
- "self_attn_every_n_layers": 2,
94
- "expert_width_multiplier": 0.75,
95
- "min_period": 0.004,
96
- "max_period": 4.0
97
  }
 
1
  {
2
+ "type": "diffusion",
3
+ "n_obs_steps": 2,
4
  "input_features": {
5
  "observation.state": {
6
  "type": "STATE",
 
43
  },
44
  "device": "cuda",
45
  "use_amp": false,
46
+ "use_peft": false,
47
  "push_to_hub": true,
48
  "repo_id": "False",
49
  "private": null,
50
  "tags": null,
51
  "license": null,
52
  "pretrained_path": null,
53
+ "horizon": 16,
54
+ "n_action_steps": 8,
55
  "normalization_mapping": {
56
+ "VISUAL": "MEAN_STD",
57
+ "STATE": "MIN_MAX",
58
+ "ACTION": "MIN_MAX"
59
  },
60
+ "drop_n_last_frames": 7,
61
+ "vision_backbone": "resnet18",
62
+ "crop_shape": [
63
+ 84,
64
+ 84
65
+ ],
66
+ "crop_is_random": true,
67
+ "pretrained_backbone_weights": null,
68
+ "use_group_norm": true,
69
+ "spatial_softmax_num_keypoints": 32,
70
+ "use_separate_rgb_encoder_per_camera": false,
71
+ "down_dims": [
72
  512,
73
+ 1024,
74
+ 2048
75
  ],
76
+ "kernel_size": 5,
77
+ "n_groups": 8,
78
+ "diffusion_step_embed_dim": 128,
79
+ "use_film_scale_modulation": true,
80
+ "noise_scheduler_type": "DDPM",
81
+ "num_train_timesteps": 100,
82
+ "beta_schedule": "squaredcos_cap_v2",
83
+ "beta_start": 0.0001,
84
+ "beta_end": 0.02,
85
+ "prediction_type": "epsilon",
86
+ "clip_sample": true,
87
+ "clip_sample_range": 1.0,
88
+ "num_inference_steps": null,
89
+ "do_mask_loss_for_padding": false,
90
  "optimizer_lr": 0.0001,
91
  "optimizer_betas": [
92
+ 0.95,
93
+ 0.999
94
  ],
95
  "optimizer_eps": 1e-08,
96
+ "optimizer_weight_decay": 1e-06,
97
+ "scheduler_name": "cosine",
98
+ "scheduler_warmup_steps": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8378d98d2f003917a142cf2bc75628f6a3adb04be96ef1822f149bd5436282e9
3
- size 1197789224
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ab9433255460c5696acd8c2c0a083c57bc4903f60d8f31f3c93342a97f5ebd0
3
+ size 1082658752
train_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "dataset": {
3
- "repo_id": "data/merged_multitask",
4
  "root": null,
5
  "episodes": null,
6
  "image_transforms": {
@@ -81,8 +81,8 @@
81
  },
82
  "env": null,
83
  "policy": {
84
- "type": "smolvla",
85
- "n_obs_steps": 1,
86
  "input_features": {
87
  "observation.state": {
88
  "type": "STATE",
@@ -125,60 +125,62 @@
125
  },
126
  "device": "cuda",
127
  "use_amp": false,
 
128
  "push_to_hub": true,
129
  "repo_id": "False",
130
  "private": null,
131
  "tags": null,
132
  "license": null,
133
  "pretrained_path": null,
134
- "chunk_size": 50,
135
- "n_action_steps": 50,
136
  "normalization_mapping": {
137
- "VISUAL": "IDENTITY",
138
- "STATE": "MEAN_STD",
139
- "ACTION": "MEAN_STD"
140
  },
141
- "max_state_dim": 32,
142
- "max_action_dim": 32,
143
- "resize_imgs_with_padding": [
 
 
 
 
 
 
 
 
 
144
  512,
145
- 512
 
146
  ],
147
- "empty_cameras": 0,
148
- "adapt_to_pi_aloha": false,
149
- "use_delta_joint_actions_aloha": false,
150
- "tokenizer_max_length": 48,
151
- "num_steps": 10,
152
- "use_cache": true,
153
- "freeze_vision_encoder": true,
154
- "train_expert_only": true,
155
- "train_state_proj": true,
 
 
 
 
 
156
  "optimizer_lr": 0.0001,
157
  "optimizer_betas": [
158
- 0.9,
159
- 0.95
160
  ],
161
  "optimizer_eps": 1e-08,
162
- "optimizer_weight_decay": 1e-10,
163
- "optimizer_grad_clip_norm": 10,
164
- "scheduler_warmup_steps": 1000,
165
- "scheduler_decay_steps": 30000,
166
- "scheduler_decay_lr": 2.5e-06,
167
- "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
168
- "load_vlm_weights": false,
169
- "add_image_special_tokens": false,
170
- "attention_mode": "cross_attn",
171
- "prefix_length": -1,
172
- "pad_language_to": "longest",
173
- "num_expert_layers": -1,
174
- "num_vlm_layers": 16,
175
- "self_attn_every_n_layers": 2,
176
- "expert_width_multiplier": 0.75,
177
- "min_period": 0.004,
178
- "max_period": 4.0
179
  },
180
- "output_dir": "/media/choi/HDD/outputs/train/smolvla_merged",
181
- "job_name": "smolvla_merged",
182
  "resume": false,
183
  "seed": 1000,
184
  "num_workers": 4,
@@ -186,26 +188,25 @@
186
  "steps": 100000,
187
  "eval_freq": 20000,
188
  "log_freq": 200,
 
189
  "save_checkpoint": true,
190
  "save_freq": 20000,
191
  "use_policy_training_preset": true,
192
  "optimizer": {
193
- "type": "adamw",
194
  "lr": 0.0001,
195
- "weight_decay": 1e-10,
196
- "grad_clip_norm": 10,
197
  "betas": [
198
- 0.9,
199
- 0.95
200
  ],
201
  "eps": 1e-08
202
  },
203
  "scheduler": {
204
- "type": "cosine_decay_with_warmup",
205
- "num_warmup_steps": 1000,
206
- "num_decay_steps": 30000,
207
- "peak_lr": 0.0001,
208
- "decay_lr": 2.5e-06
209
  },
210
  "eval": {
211
  "n_episodes": 50,
@@ -221,6 +222,12 @@
221
  "run_id": null,
222
  "mode": null
223
  },
224
- "checkpoint_path": null,
225
- "rename_map": {}
 
 
 
 
 
 
226
  }
 
1
  {
2
  "dataset": {
3
+ "repo_id": "data/banana_plate",
4
  "root": null,
5
  "episodes": null,
6
  "image_transforms": {
 
81
  },
82
  "env": null,
83
  "policy": {
84
+ "type": "diffusion",
85
+ "n_obs_steps": 2,
86
  "input_features": {
87
  "observation.state": {
88
  "type": "STATE",
 
125
  },
126
  "device": "cuda",
127
  "use_amp": false,
128
+ "use_peft": false,
129
  "push_to_hub": true,
130
  "repo_id": "False",
131
  "private": null,
132
  "tags": null,
133
  "license": null,
134
  "pretrained_path": null,
135
+ "horizon": 16,
136
+ "n_action_steps": 8,
137
  "normalization_mapping": {
138
+ "VISUAL": "MEAN_STD",
139
+ "STATE": "MIN_MAX",
140
+ "ACTION": "MIN_MAX"
141
  },
142
+ "drop_n_last_frames": 7,
143
+ "vision_backbone": "resnet18",
144
+ "crop_shape": [
145
+ 84,
146
+ 84
147
+ ],
148
+ "crop_is_random": true,
149
+ "pretrained_backbone_weights": null,
150
+ "use_group_norm": true,
151
+ "spatial_softmax_num_keypoints": 32,
152
+ "use_separate_rgb_encoder_per_camera": false,
153
+ "down_dims": [
154
  512,
155
+ 1024,
156
+ 2048
157
  ],
158
+ "kernel_size": 5,
159
+ "n_groups": 8,
160
+ "diffusion_step_embed_dim": 128,
161
+ "use_film_scale_modulation": true,
162
+ "noise_scheduler_type": "DDPM",
163
+ "num_train_timesteps": 100,
164
+ "beta_schedule": "squaredcos_cap_v2",
165
+ "beta_start": 0.0001,
166
+ "beta_end": 0.02,
167
+ "prediction_type": "epsilon",
168
+ "clip_sample": true,
169
+ "clip_sample_range": 1.0,
170
+ "num_inference_steps": null,
171
+ "do_mask_loss_for_padding": false,
172
  "optimizer_lr": 0.0001,
173
  "optimizer_betas": [
174
+ 0.95,
175
+ 0.999
176
  ],
177
  "optimizer_eps": 1e-08,
178
+ "optimizer_weight_decay": 1e-06,
179
+ "scheduler_name": "cosine",
180
+ "scheduler_warmup_steps": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  },
182
+ "output_dir": "/media/choi/HDD/outputs/train/Diffusion_banana_plate",
183
+ "job_name": "diffusion_merged",
184
  "resume": false,
185
  "seed": 1000,
186
  "num_workers": 4,
 
188
  "steps": 100000,
189
  "eval_freq": 20000,
190
  "log_freq": 200,
191
+ "tolerance_s": 0.0001,
192
  "save_checkpoint": true,
193
  "save_freq": 20000,
194
  "use_policy_training_preset": true,
195
  "optimizer": {
196
+ "type": "adam",
197
  "lr": 0.0001,
198
+ "weight_decay": 1e-06,
199
+ "grad_clip_norm": 10.0,
200
  "betas": [
201
+ 0.95,
202
+ 0.999
203
  ],
204
  "eps": 1e-08
205
  },
206
  "scheduler": {
207
+ "type": "diffuser",
208
+ "num_warmup_steps": 500,
209
+ "name": "cosine"
 
 
210
  },
211
  "eval": {
212
  "n_episodes": 50,
 
222
  "run_id": null,
223
  "mode": null
224
  },
225
+ "peft": null,
226
+ "use_rabc": false,
227
+ "rabc_progress_path": null,
228
+ "rabc_kappa": 0.01,
229
+ "rabc_epsilon": 1e-06,
230
+ "rabc_head_mode": "sparse",
231
+ "rename_map": {},
232
+ "checkpoint_path": null
233
  }