Robotics
LeRobot
Safetensors
pi0
Beilinghamburger committed on
Commit
bda6329
·
verified ·
1 Parent(s): d6f427c

Upload policy weights, train config and readme

Browse files
Files changed (4) hide show
  1. README.md +6 -7
  2. config.json +14 -27
  3. model.safetensors +2 -2
  4. train_config.json +25 -54
README.md CHANGED
@@ -1,22 +1,21 @@
1
  ---
2
- base_model: lerobot/smolvla_base
3
- datasets: Beilinghamburger/so100_vla_dataset_100
4
  library_name: lerobot
5
  license: apache-2.0
6
- model_name: smolvla
7
  pipeline_tag: robotics
8
  tags:
9
- - lerobot
10
  - robotics
11
- - smolvla
12
  ---
13
 
14
- # Model Card for smolvla
15
 
16
  <!-- Provide a quick summary of what the model is/does. -->
17
 
18
 
19
- [SmolVLA](https://huggingface.co/papers/2506.01844) is a compact, efficient vision-language-action model that achieves competitive performance at reduced computational costs and can be deployed on consumer-grade hardware.
20
 
21
 
22
  This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
 
1
  ---
2
+ datasets: Beilinghamburger/so100_vla_dataset
 
3
  library_name: lerobot
4
  license: apache-2.0
5
+ model_name: pi0
6
  pipeline_tag: robotics
7
  tags:
8
+ - pi0
9
  - robotics
10
+ - lerobot
11
  ---
12
 
13
+ # Model Card for pi0
14
 
15
  <!-- Provide a quick summary of what the model is/does. -->
16
 
17
 
18
+ [Pi0](https://huggingface.co/papers/2410.24164) is a generalist vision-language-action transformer that converts multimodal observations and text instructions into robot actions for zero-shot task transfer.
19
 
20
 
21
  This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
config.json CHANGED
@@ -1,6 +1,11 @@
1
  {
2
- "type": "smolvla",
3
  "n_obs_steps": 1,
 
 
 
 
 
4
  "input_features": {
5
  "observation.state": {
6
  "type": "STATE",
@@ -33,58 +38,40 @@
33
  ]
34
  }
35
  },
36
- "device": "cuda",
37
  "use_amp": false,
38
  "push_to_hub": true,
39
  "repo_id": "Beilinghamburger/smolvla_so100_vla",
40
  "private": null,
41
  "tags": null,
42
  "license": null,
43
- "pretrained_path": null,
44
  "chunk_size": 50,
45
  "n_action_steps": 50,
46
- "normalization_mapping": {
47
- "VISUAL": "IDENTITY",
48
- "STATE": "MEAN_STD",
49
- "ACTION": "MEAN_STD"
50
- },
51
  "max_state_dim": 32,
52
  "max_action_dim": 32,
53
  "resize_imgs_with_padding": [
54
- 512,
55
- 512
56
  ],
57
  "empty_cameras": 0,
58
  "adapt_to_pi_aloha": false,
59
  "use_delta_joint_actions_aloha": false,
60
  "tokenizer_max_length": 48,
 
61
  "num_steps": 10,
62
  "use_cache": true,
 
63
  "freeze_vision_encoder": true,
64
- "train_expert_only": true,
65
  "train_state_proj": true,
66
- "optimizer_lr": 0.0001,
67
  "optimizer_betas": [
68
  0.9,
69
  0.95
70
  ],
71
  "optimizer_eps": 1e-08,
72
  "optimizer_weight_decay": 1e-10,
73
- "optimizer_grad_clip_norm": 10,
74
  "scheduler_warmup_steps": 1000,
75
  "scheduler_decay_steps": 30000,
76
- "scheduler_decay_lr": 2.5e-06,
77
- "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
78
- "load_vlm_weights": false,
79
- "add_image_special_tokens": false,
80
- "attention_mode": "cross_attn",
81
- "prefix_length": -1,
82
- "pad_language_to": "longest",
83
- "num_expert_layers": -1,
84
- "num_vlm_layers": 16,
85
- "self_attn_every_n_layers": 2,
86
- "expert_width_multiplier": 0.75,
87
- "min_period": 0.004,
88
- "max_period": 4.0,
89
- "rtc_config": null
90
  }
 
1
  {
2
+ "type": "pi0",
3
  "n_obs_steps": 1,
4
+ "normalization_mapping": {
5
+ "VISUAL": "IDENTITY",
6
+ "STATE": "MEAN_STD",
7
+ "ACTION": "MEAN_STD"
8
+ },
9
  "input_features": {
10
  "observation.state": {
11
  "type": "STATE",
 
38
  ]
39
  }
40
  },
41
+ "device": "cpu",
42
  "use_amp": false,
43
  "push_to_hub": true,
44
  "repo_id": "Beilinghamburger/smolvla_so100_vla",
45
  "private": null,
46
  "tags": null,
47
  "license": null,
 
48
  "chunk_size": 50,
49
  "n_action_steps": 50,
 
 
 
 
 
50
  "max_state_dim": 32,
51
  "max_action_dim": 32,
52
  "resize_imgs_with_padding": [
53
+ 224,
54
+ 224
55
  ],
56
  "empty_cameras": 0,
57
  "adapt_to_pi_aloha": false,
58
  "use_delta_joint_actions_aloha": false,
59
  "tokenizer_max_length": 48,
60
+ "proj_width": 1024,
61
  "num_steps": 10,
62
  "use_cache": true,
63
+ "attention_implementation": "eager",
64
  "freeze_vision_encoder": true,
65
+ "train_expert_only": false,
66
  "train_state_proj": true,
67
+ "optimizer_lr": 2.5e-05,
68
  "optimizer_betas": [
69
  0.9,
70
  0.95
71
  ],
72
  "optimizer_eps": 1e-08,
73
  "optimizer_weight_decay": 1e-10,
 
74
  "scheduler_warmup_steps": 1000,
75
  "scheduler_decay_steps": 30000,
76
+ "scheduler_decay_lr": 2.5e-06
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1660564e0274422231d942fe1dea8857151657ecbf80a572d101a39f0b632198
3
- size 1197789224
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27936bb0da4cc16781f1dc4a19419790a56c3f547707cd4c38ba78386954d94d
3
+ size 7536025152
train_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "dataset": {
3
- "repo_id": "Beilinghamburger/so100_vla_dataset_100",
4
  "root": null,
5
  "episodes": null,
6
  "image_transforms": {
@@ -57,20 +57,6 @@
57
  1.5
58
  ]
59
  }
60
- },
61
- "affine": {
62
- "weight": 1.0,
63
- "type": "RandomAffine",
64
- "kwargs": {
65
- "degrees": [
66
- -5.0,
67
- 5.0
68
- ],
69
- "translate": [
70
- 0.05,
71
- 0.05
72
- ]
73
- }
74
  }
75
  }
76
  },
@@ -81,8 +67,13 @@
81
  },
82
  "env": null,
83
  "policy": {
84
- "type": "smolvla",
85
  "n_obs_steps": 1,
 
 
 
 
 
86
  "input_features": {
87
  "observation.state": {
88
  "type": "STATE",
@@ -115,78 +106,60 @@
115
  ]
116
  }
117
  },
118
- "device": "cuda",
119
  "use_amp": false,
120
  "push_to_hub": true,
121
  "repo_id": "Beilinghamburger/smolvla_so100_vla",
122
  "private": null,
123
  "tags": null,
124
  "license": null,
125
- "pretrained_path": null,
126
  "chunk_size": 50,
127
  "n_action_steps": 50,
128
- "normalization_mapping": {
129
- "VISUAL": "IDENTITY",
130
- "STATE": "MEAN_STD",
131
- "ACTION": "MEAN_STD"
132
- },
133
  "max_state_dim": 32,
134
  "max_action_dim": 32,
135
  "resize_imgs_with_padding": [
136
- 512,
137
- 512
138
  ],
139
  "empty_cameras": 0,
140
  "adapt_to_pi_aloha": false,
141
  "use_delta_joint_actions_aloha": false,
142
  "tokenizer_max_length": 48,
 
143
  "num_steps": 10,
144
  "use_cache": true,
 
145
  "freeze_vision_encoder": true,
146
- "train_expert_only": true,
147
  "train_state_proj": true,
148
- "optimizer_lr": 0.0001,
149
  "optimizer_betas": [
150
  0.9,
151
  0.95
152
  ],
153
  "optimizer_eps": 1e-08,
154
  "optimizer_weight_decay": 1e-10,
155
- "optimizer_grad_clip_norm": 10,
156
  "scheduler_warmup_steps": 1000,
157
  "scheduler_decay_steps": 30000,
158
- "scheduler_decay_lr": 2.5e-06,
159
- "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
160
- "load_vlm_weights": false,
161
- "add_image_special_tokens": false,
162
- "attention_mode": "cross_attn",
163
- "prefix_length": -1,
164
- "pad_language_to": "longest",
165
- "num_expert_layers": -1,
166
- "num_vlm_layers": 16,
167
- "self_attn_every_n_layers": 2,
168
- "expert_width_multiplier": 0.75,
169
- "min_period": 0.004,
170
- "max_period": 4.0,
171
- "rtc_config": null
172
  },
173
- "output_dir": "outputs/train/smolvla_so100_vla1215_new",
174
- "job_name": "smolvla_so100_vla",
175
  "resume": false,
176
  "seed": 1000,
177
  "num_workers": 4,
178
- "batch_size": 32,
179
- "steps": 40000,
180
- "eval_freq": 10000,
181
  "log_freq": 200,
182
  "save_checkpoint": true,
183
- "save_freq": 10000,
184
  "use_policy_training_preset": true,
185
  "optimizer": {
186
  "type": "adamw",
187
- "lr": 0.0001,
188
  "weight_decay": 1e-10,
189
- "grad_clip_norm": 10,
190
  "betas": [
191
  0.9,
192
  0.95
@@ -197,7 +170,7 @@
197
  "type": "cosine_decay_with_warmup",
198
  "num_warmup_steps": 1000,
199
  "num_decay_steps": 30000,
200
- "peak_lr": 0.0001,
201
  "decay_lr": 2.5e-06
202
  },
203
  "eval": {
@@ -213,7 +186,5 @@
213
  "notes": null,
214
  "run_id": null,
215
  "mode": null
216
- },
217
- "checkpoint_path": null,
218
- "rename_map": {}
219
  }
 
1
  {
2
  "dataset": {
3
+ "repo_id": "Beilinghamburger/so100_vla_dataset",
4
  "root": null,
5
  "episodes": null,
6
  "image_transforms": {
 
57
  1.5
58
  ]
59
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  }
61
  }
62
  },
 
67
  },
68
  "env": null,
69
  "policy": {
70
+ "type": "pi0",
71
  "n_obs_steps": 1,
72
+ "normalization_mapping": {
73
+ "VISUAL": "IDENTITY",
74
+ "STATE": "MEAN_STD",
75
+ "ACTION": "MEAN_STD"
76
+ },
77
  "input_features": {
78
  "observation.state": {
79
  "type": "STATE",
 
106
  ]
107
  }
108
  },
109
+ "device": "cpu",
110
  "use_amp": false,
111
  "push_to_hub": true,
112
  "repo_id": "Beilinghamburger/smolvla_so100_vla",
113
  "private": null,
114
  "tags": null,
115
  "license": null,
 
116
  "chunk_size": 50,
117
  "n_action_steps": 50,
 
 
 
 
 
118
  "max_state_dim": 32,
119
  "max_action_dim": 32,
120
  "resize_imgs_with_padding": [
121
+ 224,
122
+ 224
123
  ],
124
  "empty_cameras": 0,
125
  "adapt_to_pi_aloha": false,
126
  "use_delta_joint_actions_aloha": false,
127
  "tokenizer_max_length": 48,
128
+ "proj_width": 1024,
129
  "num_steps": 10,
130
  "use_cache": true,
131
+ "attention_implementation": "eager",
132
  "freeze_vision_encoder": true,
133
+ "train_expert_only": false,
134
  "train_state_proj": true,
135
+ "optimizer_lr": 2.5e-05,
136
  "optimizer_betas": [
137
  0.9,
138
  0.95
139
  ],
140
  "optimizer_eps": 1e-08,
141
  "optimizer_weight_decay": 1e-10,
 
142
  "scheduler_warmup_steps": 1000,
143
  "scheduler_decay_steps": 30000,
144
+ "scheduler_decay_lr": 2.5e-06
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  },
146
+ "output_dir": "outputs/train/pi_so100_vla1215",
147
+ "job_name": "pi_so100_vla",
148
  "resume": false,
149
  "seed": 1000,
150
  "num_workers": 4,
151
+ "batch_size": 4,
152
+ "steps": 1,
153
+ "eval_freq": 100,
154
  "log_freq": 200,
155
  "save_checkpoint": true,
156
+ "save_freq": 100,
157
  "use_policy_training_preset": true,
158
  "optimizer": {
159
  "type": "adamw",
160
+ "lr": 2.5e-05,
161
  "weight_decay": 1e-10,
162
+ "grad_clip_norm": 10.0,
163
  "betas": [
164
  0.9,
165
  0.95
 
170
  "type": "cosine_decay_with_warmup",
171
  "num_warmup_steps": 1000,
172
  "num_decay_steps": 30000,
173
+ "peak_lr": 2.5e-05,
174
  "decay_lr": 2.5e-06
175
  },
176
  "eval": {
 
186
  "notes": null,
187
  "run_id": null,
188
  "mode": null
189
+ }
 
 
190
  }