zeeshaan-ai committed on
Commit
fdef9b5
·
verified ·
1 Parent(s): 1597161

Upload policy weights, train config and readme

Browse files
Files changed (4) hide show
  1. README.md +13 -7
  2. config.json +28 -34
  3. model.safetensors +2 -2
  4. train_config.json +34 -40
README.md CHANGED
@@ -1,23 +1,29 @@
1
  ---
2
- base_model: lerobot/smolvla_base
3
  datasets: GetSoloTech/FoodStack
4
  library_name: lerobot
5
  license: apache-2.0
6
- model_name: smolvla
7
  pipeline_tag: robotics
8
  tags:
9
- - lerobot
10
  - robotics
11
- - smolvla
12
- - unsloth
13
  ---
14
 
15
- # Model Card for smolvla
16
 
17
  <!-- Provide a quick summary of what the model is/does. -->
18
 
19
 
20
- [SmolVLA](https://huggingface.co/papers/2506.01844) is a compact, efficient vision-language-action model that achieves competitive performance at reduced computational costs and can be deployed on consumer-grade hardware.
 
 
 
 
 
 
 
 
21
 
22
 
23
  This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
 
1
  ---
 
2
  datasets: GetSoloTech/FoodStack
3
  library_name: lerobot
4
  license: apache-2.0
5
+ model_name: pi05
6
  pipeline_tag: robotics
7
  tags:
8
+ - pi05
9
  - robotics
10
+ - lerobot
 
11
  ---
12
 
13
+ # Model Card for pi05
14
 
15
  <!-- Provide a quick summary of what the model is/does. -->
16
 
17
 
18
+ **π₀.₅ (Pi05) Policy**
19
+
20
+ π₀.₅ is a Vision-Language-Action model with open-world generalization, from Physical Intelligence. The LeRobot implementation is adapted from their open-source OpenPI repository.
21
+
22
+ **Model Overview**
23
+
24
+ π₀.₅ represents a significant evolution from π₀, developed by Physical Intelligence to address a big challenge in robotics: open-world generalization. While robots can perform impressive tasks in controlled environments, π₀.₅ is designed to generalize to entirely new environments and situations that were never seen during training.
25
+
26
+ For more details, see the [Physical Intelligence π₀.₅ blog post](https://www.physicalintelligence.company/blog/pi05).
27
 
28
 
29
  This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "type": "smolvla",
3
  "n_obs_steps": 1,
4
  "input_features": {
5
  "observation.state": {
@@ -40,50 +40,44 @@
40
  "private": null,
41
  "tags": null,
42
  "license": null,
43
- "pretrained_path": "lerobot/smolvla_base",
 
 
 
44
  "chunk_size": 50,
45
  "n_action_steps": 50,
46
- "normalization_mapping": {
47
- "VISUAL": "IDENTITY",
48
- "STATE": "MEAN_STD",
49
- "ACTION": "MEAN_STD"
50
- },
51
  "max_state_dim": 32,
52
  "max_action_dim": 32,
53
- "resize_imgs_with_padding": [
54
- 512,
55
- 512
 
 
 
 
 
 
 
56
  ],
57
  "empty_cameras": 0,
58
- "adapt_to_pi_aloha": false,
59
- "use_delta_joint_actions_aloha": false,
60
- "tokenizer_max_length": 48,
61
- "num_steps": 10,
62
- "use_cache": true,
63
- "freeze_vision_encoder": true,
64
- "train_expert_only": true,
65
- "train_state_proj": true,
66
- "optimizer_lr": 0.0001,
 
67
  "optimizer_betas": [
68
  0.9,
69
  0.95
70
  ],
71
  "optimizer_eps": 1e-08,
72
- "optimizer_weight_decay": 1e-10,
73
- "optimizer_grad_clip_norm": 10,
74
  "scheduler_warmup_steps": 1000,
75
  "scheduler_decay_steps": 30000,
76
- "scheduler_decay_lr": 2.5e-06,
77
- "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
78
- "load_vlm_weights": false,
79
- "add_image_special_tokens": false,
80
- "attention_mode": "cross_attn",
81
- "prefix_length": -1,
82
- "pad_language_to": "longest",
83
- "num_expert_layers": -1,
84
- "num_vlm_layers": 16,
85
- "self_attn_every_n_layers": 2,
86
- "expert_width_multiplier": 0.75,
87
- "min_period": 0.004,
88
- "max_period": 4.0
89
  }
 
1
  {
2
+ "type": "pi05",
3
  "n_obs_steps": 1,
4
  "input_features": {
5
  "observation.state": {
 
40
  "private": null,
41
  "tags": null,
42
  "license": null,
43
+ "pretrained_path": "lerobot/pi05_base",
44
+ "paligemma_variant": "gemma_2b",
45
+ "action_expert_variant": "gemma_300m",
46
+ "dtype": "bfloat16",
47
  "chunk_size": 50,
48
  "n_action_steps": 50,
 
 
 
 
 
49
  "max_state_dim": 32,
50
  "max_action_dim": 32,
51
+ "num_inference_steps": 10,
52
+ "time_sampling_beta_alpha": 1.5,
53
+ "time_sampling_beta_beta": 1.0,
54
+ "time_sampling_scale": 0.999,
55
+ "time_sampling_offset": 0.001,
56
+ "min_period": 0.004,
57
+ "max_period": 4.0,
58
+ "image_resolution": [
59
+ 224,
60
+ 224
61
  ],
62
  "empty_cameras": 0,
63
+ "tokenizer_max_length": 200,
64
+ "normalization_mapping": {
65
+ "ACTION": "MEAN_STD",
66
+ "STATE": "MEAN_STD",
67
+ "VISUAL": "IDENTITY"
68
+ },
69
+ "gradient_checkpointing": true,
70
+ "compile_model": true,
71
+ "compile_mode": "max-autotune",
72
+ "optimizer_lr": 2.5e-05,
73
  "optimizer_betas": [
74
  0.9,
75
  0.95
76
  ],
77
  "optimizer_eps": 1e-08,
78
+ "optimizer_weight_decay": 0.01,
79
+ "optimizer_grad_clip_norm": 1.0,
80
  "scheduler_warmup_steps": 1000,
81
  "scheduler_decay_steps": 30000,
82
+ "scheduler_decay_lr": 2.5e-06
 
 
 
 
 
 
 
 
 
 
 
 
83
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:abf6d4335765e784ba6ffb392d36e51cb0f97de44327ec6d1b351d5c5c4919b7
3
- size 1197789224
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e9f0192d2f24b447d95d1ec4e23b8da5e8972faae87207c0e7e09af9e094b56
3
+ size 7473096344
train_config.json CHANGED
@@ -81,7 +81,7 @@
81
  },
82
  "env": null,
83
  "policy": {
84
- "type": "smolvla",
85
  "n_obs_steps": 1,
86
  "input_features": {
87
  "observation.state": {
@@ -122,55 +122,49 @@
122
  "private": null,
123
  "tags": null,
124
  "license": null,
125
- "pretrained_path": "lerobot/smolvla_base",
 
 
 
126
  "chunk_size": 50,
127
  "n_action_steps": 50,
128
- "normalization_mapping": {
129
- "VISUAL": "IDENTITY",
130
- "STATE": "MEAN_STD",
131
- "ACTION": "MEAN_STD"
132
- },
133
  "max_state_dim": 32,
134
  "max_action_dim": 32,
135
- "resize_imgs_with_padding": [
136
- 512,
137
- 512
 
 
 
 
 
 
 
138
  ],
139
  "empty_cameras": 0,
140
- "adapt_to_pi_aloha": false,
141
- "use_delta_joint_actions_aloha": false,
142
- "tokenizer_max_length": 48,
143
- "num_steps": 10,
144
- "use_cache": true,
145
- "freeze_vision_encoder": true,
146
- "train_expert_only": true,
147
- "train_state_proj": true,
148
- "optimizer_lr": 0.0001,
 
149
  "optimizer_betas": [
150
  0.9,
151
  0.95
152
  ],
153
  "optimizer_eps": 1e-08,
154
- "optimizer_weight_decay": 1e-10,
155
- "optimizer_grad_clip_norm": 10,
156
  "scheduler_warmup_steps": 1000,
157
  "scheduler_decay_steps": 30000,
158
- "scheduler_decay_lr": 2.5e-06,
159
- "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
160
- "load_vlm_weights": false,
161
- "add_image_special_tokens": false,
162
- "attention_mode": "cross_attn",
163
- "prefix_length": -1,
164
- "pad_language_to": "longest",
165
- "num_expert_layers": -1,
166
- "num_vlm_layers": 16,
167
- "self_attn_every_n_layers": 2,
168
- "expert_width_multiplier": 0.75,
169
- "min_period": 0.004,
170
- "max_period": 4.0
171
  },
172
  "output_dir": "/workspace/outputs",
173
- "job_name": "uhyugtu",
174
  "resume": false,
175
  "seed": 1000,
176
  "num_workers": 4,
@@ -179,13 +173,13 @@
179
  "eval_freq": 20000,
180
  "log_freq": 200,
181
  "save_checkpoint": true,
182
- "save_freq": 3000,
183
  "use_policy_training_preset": true,
184
  "optimizer": {
185
  "type": "adamw",
186
- "lr": 0.0001,
187
- "weight_decay": 1e-10,
188
- "grad_clip_norm": 10,
189
  "betas": [
190
  0.9,
191
  0.95
@@ -196,7 +190,7 @@
196
  "type": "cosine_decay_with_warmup",
197
  "num_warmup_steps": 1000,
198
  "num_decay_steps": 30000,
199
- "peak_lr": 0.0001,
200
  "decay_lr": 2.5e-06
201
  },
202
  "eval": {
 
81
  },
82
  "env": null,
83
  "policy": {
84
+ "type": "pi05",
85
  "n_obs_steps": 1,
86
  "input_features": {
87
  "observation.state": {
 
122
  "private": null,
123
  "tags": null,
124
  "license": null,
125
+ "pretrained_path": "lerobot/pi05_base",
126
+ "paligemma_variant": "gemma_2b",
127
+ "action_expert_variant": "gemma_300m",
128
+ "dtype": "bfloat16",
129
  "chunk_size": 50,
130
  "n_action_steps": 50,
 
 
 
 
 
131
  "max_state_dim": 32,
132
  "max_action_dim": 32,
133
+ "num_inference_steps": 10,
134
+ "time_sampling_beta_alpha": 1.5,
135
+ "time_sampling_beta_beta": 1.0,
136
+ "time_sampling_scale": 0.999,
137
+ "time_sampling_offset": 0.001,
138
+ "min_period": 0.004,
139
+ "max_period": 4.0,
140
+ "image_resolution": [
141
+ 224,
142
+ 224
143
  ],
144
  "empty_cameras": 0,
145
+ "tokenizer_max_length": 200,
146
+ "normalization_mapping": {
147
+ "ACTION": "MEAN_STD",
148
+ "STATE": "MEAN_STD",
149
+ "VISUAL": "IDENTITY"
150
+ },
151
+ "gradient_checkpointing": true,
152
+ "compile_model": true,
153
+ "compile_mode": "max-autotune",
154
+ "optimizer_lr": 2.5e-05,
155
  "optimizer_betas": [
156
  0.9,
157
  0.95
158
  ],
159
  "optimizer_eps": 1e-08,
160
+ "optimizer_weight_decay": 0.01,
161
+ "optimizer_grad_clip_norm": 1.0,
162
  "scheduler_warmup_steps": 1000,
163
  "scheduler_decay_steps": 30000,
164
+ "scheduler_decay_lr": 2.5e-06
 
 
 
 
 
 
 
 
 
 
 
 
165
  },
166
  "output_dir": "/workspace/outputs",
167
+ "job_name": "qqqqqqqqqqqqqqqqqvlaaa",
168
  "resume": false,
169
  "seed": 1000,
170
  "num_workers": 4,
 
173
  "eval_freq": 20000,
174
  "log_freq": 200,
175
  "save_checkpoint": true,
176
+ "save_freq": 1000,
177
  "use_policy_training_preset": true,
178
  "optimizer": {
179
  "type": "adamw",
180
+ "lr": 2.5e-05,
181
+ "weight_decay": 0.01,
182
+ "grad_clip_norm": 1.0,
183
  "betas": [
184
  0.9,
185
  0.95
 
190
  "type": "cosine_decay_with_warmup",
191
  "num_warmup_steps": 1000,
192
  "num_decay_steps": 30000,
193
+ "peak_lr": 2.5e-05,
194
  "decay_lr": 2.5e-06
195
  },
196
  "eval": {