thomas0829 committed on
Commit
f20b0c5
·
verified ·
1 Parent(s): 8772656

Upload policy with preprocessor, postprocessor, and model card

Browse files
README.md CHANGED
@@ -1,26 +1,38 @@
1
  ---
2
- datasets: thomas0829/bimanual_so100_grab
3
  library_name: lerobot
4
  license: apache-2.0
5
- model_name: act
6
  pipeline_tag: robotics
7
  tags:
8
- - robotics
9
  - lerobot
10
- - act
 
11
  ---
12
 
13
- # Model Card for act
14
 
15
  <!-- Provide a quick summary of what the model is/does. -->
16
 
 
 
 
17
 
18
- [Action Chunking with Transformers (ACT)](https://huggingface.co/papers/2304.13705) is an imitation-learning method that predicts short action chunks instead of single steps. It learns from teleoperated data and often achieves high success rates.
19
 
 
 
 
 
 
20
 
21
  This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
22
  See the full documentation at [LeRobot Docs](https://huggingface.co/docs/lerobot/index).
23
 
 
 
 
 
24
  ---
25
 
26
  ## How to Get Started with the Model
@@ -33,7 +45,7 @@ Below is the short version on how to train and run inference/eval:
33
  ```bash
34
  lerobot-train \
35
  --dataset.repo_id=${HF_USER}/<dataset> \
36
- --policy.type=act \
37
  --output_dir=outputs/train/<desired_policy_repo_id> \
38
  --job_name=lerobot_training \
39
  --policy.device=cuda \
@@ -56,7 +68,3 @@ lerobot-record \
56
  Prefix the dataset repo with **eval\_** and supply `--policy.path` pointing to a local or hub checkpoint.
57
 
58
  ---
59
-
60
- ## Model Details
61
-
62
- - **License:** apache-2.0
 
1
  ---
2
+ datasets: thomas0829/fold_the_towel
3
  library_name: lerobot
4
  license: apache-2.0
5
+ model_name: pi05
6
  pipeline_tag: robotics
7
  tags:
 
8
  - lerobot
9
+ - pi05
10
+ - robotics
11
  ---
12
 
13
+ # Model Card for pi05
14
 
15
  <!-- Provide a quick summary of what the model is/does. -->
16
 
17
+ **π₀.₅ (Pi05) Policy**
18
+
19
+ π₀.₅ (Pi05) Policy
20
 
21
+ π₀.₅ is a Vision-Language-Action model with open-world generalization, from Physical Intelligence. The LeRobot implementation is adapted from their open-source OpenPI repository.
22
 
23
+ **Model Overview**
24
+
25
+ π₀.₅ represents a significant evolution from π₀, developed by Physical Intelligence to address a big challenge in robotics: open-world generalization. While robots can perform impressive tasks in controlled environments, π₀.₅ is designed to generalize to entirely new environments and situations that were never seen during training.
26
+
27
+ For more details, see the [Physical Intelligence π₀.₅ blog post](https://www.physicalintelligence.company/blog/pi05).
28
 
29
  This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
30
  See the full documentation at [LeRobot Docs](https://huggingface.co/docs/lerobot/index).
31
 
32
+ ## Model Details
33
+
34
+ - **License:** apache-2.0
35
+
36
  ---
37
 
38
  ## How to Get Started with the Model
 
45
  ```bash
46
  lerobot-train \
47
  --dataset.repo_id=${HF_USER}/<dataset> \
48
+ --policy.type=pi05 \
49
  --output_dir=outputs/train/<desired_policy_repo_id> \
50
  --job_name=lerobot_training \
51
  --policy.device=cuda \
 
68
  Prefix the dataset repo with **eval\_** and supply `--policy.path` pointing to a local or hub checkpoint.
69
 
70
  ---
 
 
 
 
config.json CHANGED
@@ -1,35 +1,40 @@
1
  {
2
- "type": "act",
3
  "n_obs_steps": 1,
 
 
 
 
 
4
  "input_features": {
5
  "observation.state": {
6
  "type": "STATE",
7
  "shape": [
8
- 12
9
  ]
10
  },
11
  "observation.images.left": {
12
  "type": "VISUAL",
13
  "shape": [
14
  3,
15
- 480,
16
- 640
17
  ]
18
  },
19
- "observation.images.right": {
20
  "type": "VISUAL",
21
  "shape": [
22
  3,
23
- 480,
24
- 640
25
  ]
26
  },
27
- "observation.images.top": {
28
  "type": "VISUAL",
29
  "shape": [
30
  3,
31
- 720,
32
- 1280
33
  ]
34
  }
35
  },
@@ -37,42 +42,37 @@
37
  "action": {
38
  "type": "ACTION",
39
  "shape": [
40
- 12
41
  ]
42
  }
43
  },
44
  "device": "cuda",
45
  "use_amp": false,
46
  "push_to_hub": true,
47
- "repo_id": "thomas0829/policy_bimanual_grab",
48
- "private": null,
49
  "tags": null,
50
  "license": null,
51
- "pretrained_path": null,
52
- "chunk_size": 100,
53
- "n_action_steps": 100,
54
- "normalization_mapping": {
55
- "VISUAL": "MEAN_STD",
56
- "STATE": "MEAN_STD",
57
- "ACTION": "MEAN_STD"
58
- },
59
- "vision_backbone": "resnet18",
60
- "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
61
- "replace_final_stride_with_dilation": false,
62
- "pre_norm": false,
63
- "dim_model": 512,
64
- "n_heads": 8,
65
- "dim_feedforward": 3200,
66
- "feedforward_activation": "relu",
67
- "n_encoder_layers": 4,
68
- "n_decoder_layers": 1,
69
- "use_vae": true,
70
- "latent_dim": 32,
71
- "n_vae_encoder_layers": 4,
72
- "temporal_ensemble_coeff": null,
73
- "dropout": 0.1,
74
- "kl_weight": 10.0,
75
- "optimizer_lr": 1e-05,
76
- "optimizer_weight_decay": 0.0001,
77
- "optimizer_lr_backbone": 1e-05
78
  }
 
1
  {
2
+ "type": "pi05",
3
  "n_obs_steps": 1,
4
+ "normalization_mapping": {
5
+ "VISUAL": "IDENTITY",
6
+ "STATE": "QUANTILES",
7
+ "ACTION": "QUANTILES"
8
+ },
9
  "input_features": {
10
  "observation.state": {
11
  "type": "STATE",
12
  "shape": [
13
+ 14
14
  ]
15
  },
16
  "observation.images.left": {
17
  "type": "VISUAL",
18
  "shape": [
19
  3,
20
+ 224,
21
+ 224
22
  ]
23
  },
24
+ "observation.images.top": {
25
  "type": "VISUAL",
26
  "shape": [
27
  3,
28
+ 224,
29
+ 224
30
  ]
31
  },
32
+ "observation.images.right": {
33
  "type": "VISUAL",
34
  "shape": [
35
  3,
36
+ 224,
37
+ 224
38
  ]
39
  }
40
  },
 
42
  "action": {
43
  "type": "ACTION",
44
  "shape": [
45
+ 14
46
  ]
47
  }
48
  },
49
  "device": "cuda",
50
  "use_amp": false,
51
  "push_to_hub": true,
52
+ "repo_id": "thomas0829/test",
53
+ "private": false,
54
  "tags": null,
55
  "license": null,
56
+ "pretrained_path": "lerobot/pi05_base",
57
+ "paligemma_variant": "gemma_2b",
58
+ "action_expert_variant": "gemma_300m",
59
+ "dtype": "bfloat16",
60
+ "chunk_size": 50,
61
+ "n_action_steps": 50,
62
+ "max_state_dim": 32,
63
+ "max_action_dim": 32,
64
+ "num_inference_steps": 10,
65
+ "time_sampling_beta_alpha": 1.5,
66
+ "time_sampling_beta_beta": 1.0,
67
+ "time_sampling_scale": 0.999,
68
+ "time_sampling_offset": 0.001,
69
+ "min_period": 0.004,
70
+ "max_period": 4.0,
71
+ "rtc_config": null,
72
+ "image_resolution": [
73
+ 224,
74
+ 224
75
+ ],
76
+ "empty_cameras": 0,
77
+ "tokenizer_max_length": 200
 
 
 
 
 
78
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc84041cff73193b00b49280406ac9cf798d2a6dcc3cfa62ade5af83621b18d8
3
- size 206748912
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fda3408e745af5826c58b556695f784f3d7090ebfcd90a498aaaed583e1e10d
3
+ size 7473096344
policy_postprocessor.json CHANGED
@@ -9,14 +9,14 @@
9
  "action": {
10
  "type": "ACTION",
11
  "shape": [
12
- 12
13
  ]
14
  }
15
  },
16
  "norm_map": {
17
- "VISUAL": "MEAN_STD",
18
- "STATE": "MEAN_STD",
19
- "ACTION": "MEAN_STD"
20
  }
21
  },
22
  "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors"
 
9
  "action": {
10
  "type": "ACTION",
11
  "shape": [
12
+ 14
13
  ]
14
  }
15
  },
16
  "norm_map": {
17
+ "VISUAL": "IDENTITY",
18
+ "STATE": "QUANTILES",
19
+ "ACTION": "QUANTILES"
20
  }
21
  },
22
  "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors"
policy_postprocessor_step_0_unnormalizer_processor.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a48c086134112145d831b7dd8ec6edddf16cb2ef39a97ca83c408d802c078d94
3
- size 9024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74f49c612d42e9bf57d8cf688169046fd01c1078e58c30bd18daaa4fb42d7a22
3
+ size 9400
policy_preprocessor.json CHANGED
@@ -11,13 +11,6 @@
11
  "registry_name": "to_batch_processor",
12
  "config": {}
13
  },
14
- {
15
- "registry_name": "device_processor",
16
- "config": {
17
- "device": "cuda",
18
- "float_dtype": null
19
- }
20
- },
21
  {
22
  "registry_name": "normalizer_processor",
23
  "config": {
@@ -26,47 +19,69 @@
26
  "observation.state": {
27
  "type": "STATE",
28
  "shape": [
29
- 12
30
  ]
31
  },
32
  "observation.images.left": {
33
  "type": "VISUAL",
34
  "shape": [
35
  3,
36
- 480,
37
- 640
38
  ]
39
  },
40
- "observation.images.right": {
41
  "type": "VISUAL",
42
  "shape": [
43
  3,
44
- 480,
45
- 640
46
  ]
47
  },
48
- "observation.images.top": {
49
  "type": "VISUAL",
50
  "shape": [
51
  3,
52
- 720,
53
- 1280
54
  ]
55
  },
56
  "action": {
57
  "type": "ACTION",
58
  "shape": [
59
- 12
60
  ]
61
  }
62
  },
63
  "norm_map": {
64
- "VISUAL": "MEAN_STD",
65
- "STATE": "MEAN_STD",
66
- "ACTION": "MEAN_STD"
67
  }
68
  },
69
- "state_file": "policy_preprocessor_step_3_normalizer_processor.safetensors"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  }
71
  ]
72
  }
 
11
  "registry_name": "to_batch_processor",
12
  "config": {}
13
  },
 
 
 
 
 
 
 
14
  {
15
  "registry_name": "normalizer_processor",
16
  "config": {
 
19
  "observation.state": {
20
  "type": "STATE",
21
  "shape": [
22
+ 14
23
  ]
24
  },
25
  "observation.images.left": {
26
  "type": "VISUAL",
27
  "shape": [
28
  3,
29
+ 224,
30
+ 224
31
  ]
32
  },
33
+ "observation.images.top": {
34
  "type": "VISUAL",
35
  "shape": [
36
  3,
37
+ 224,
38
+ 224
39
  ]
40
  },
41
+ "observation.images.right": {
42
  "type": "VISUAL",
43
  "shape": [
44
  3,
45
+ 224,
46
+ 224
47
  ]
48
  },
49
  "action": {
50
  "type": "ACTION",
51
  "shape": [
52
+ 14
53
  ]
54
  }
55
  },
56
  "norm_map": {
57
+ "VISUAL": "IDENTITY",
58
+ "STATE": "QUANTILES",
59
+ "ACTION": "QUANTILES"
60
  }
61
  },
62
+ "state_file": "policy_preprocessor_step_2_normalizer_processor.safetensors"
63
+ },
64
+ {
65
+ "registry_name": "pi05_prepare_state_tokenizer_processor_step",
66
+ "config": {}
67
+ },
68
+ {
69
+ "registry_name": "tokenizer_processor",
70
+ "config": {
71
+ "max_length": 200,
72
+ "task_key": "task",
73
+ "padding_side": "right",
74
+ "padding": "max_length",
75
+ "truncation": true,
76
+ "tokenizer_name": "google/paligemma-3b-pt-224"
77
+ }
78
+ },
79
+ {
80
+ "registry_name": "device_processor",
81
+ "config": {
82
+ "device": "cuda",
83
+ "float_dtype": null
84
+ }
85
  }
86
  ]
87
  }
policy_preprocessor_step_2_normalizer_processor.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74f49c612d42e9bf57d8cf688169046fd01c1078e58c30bd18daaa4fb42d7a22
3
+ size 9400
train_config.json CHANGED
@@ -1,10 +1,11 @@
1
  {
2
  "dataset": {
3
- "repo_id": "thomas0829/bimanual_so100_grab",
 
4
  "root": null,
5
  "episodes": null,
6
  "image_transforms": {
7
- "enable": false,
8
  "max_num_transforms": 3,
9
  "random_order": false,
10
  "tfs": {
@@ -57,47 +58,68 @@
57
  1.5
58
  ]
59
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  }
61
  }
62
  },
63
  "revision": null,
64
  "use_imagenet_stats": true,
65
  "video_backend": "torchcodec",
66
- "streaming": false
 
67
  },
 
68
  "env": null,
69
  "policy": {
70
- "type": "act",
71
  "n_obs_steps": 1,
 
 
 
 
 
72
  "input_features": {
73
  "observation.state": {
74
  "type": "STATE",
75
  "shape": [
76
- 12
77
  ]
78
  },
79
  "observation.images.left": {
80
  "type": "VISUAL",
81
  "shape": [
82
  3,
83
- 480,
84
- 640
85
  ]
86
  },
87
- "observation.images.right": {
88
  "type": "VISUAL",
89
  "shape": [
90
  3,
91
- 480,
92
- 640
93
  ]
94
  },
95
- "observation.images.top": {
96
  "type": "VISUAL",
97
  "shape": [
98
  3,
99
- 720,
100
- 1280
101
  ]
102
  }
103
  },
@@ -105,69 +127,98 @@
105
  "action": {
106
  "type": "ACTION",
107
  "shape": [
108
- 12
109
  ]
110
  }
111
  },
112
  "device": "cuda",
113
  "use_amp": false,
 
114
  "push_to_hub": true,
115
- "repo_id": "thomas0829/policy_bimanual_grab",
116
- "private": null,
117
  "tags": null,
118
  "license": null,
119
- "pretrained_path": null,
120
- "chunk_size": 100,
121
- "n_action_steps": 100,
122
- "normalization_mapping": {
123
- "VISUAL": "MEAN_STD",
124
- "STATE": "MEAN_STD",
125
- "ACTION": "MEAN_STD"
126
- },
127
- "vision_backbone": "resnet18",
128
- "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
129
- "replace_final_stride_with_dilation": false,
130
- "pre_norm": false,
131
- "dim_model": 512,
132
- "n_heads": 8,
133
- "dim_feedforward": 3200,
134
- "feedforward_activation": "relu",
135
- "n_encoder_layers": 4,
136
- "n_decoder_layers": 1,
137
- "use_vae": true,
138
- "latent_dim": 32,
139
- "n_vae_encoder_layers": 4,
140
- "temporal_ensemble_coeff": null,
141
- "dropout": 0.1,
142
- "kl_weight": 10.0,
143
- "optimizer_lr": 1e-05,
144
- "optimizer_weight_decay": 0.0001,
145
- "optimizer_lr_backbone": 1e-05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  },
147
- "output_dir": "outputs/train/bimanual_act_so100_grab",
148
- "job_name": "bimanual_act_so100_grab",
 
 
 
149
  "resume": false,
150
- "seed": 1000,
 
151
  "num_workers": 4,
152
- "batch_size": 4,
153
- "steps": 10000,
154
- "eval_freq": 1000,
155
- "log_freq": 200,
 
156
  "save_checkpoint": true,
157
- "save_freq": 2500,
 
 
158
  "use_policy_training_preset": true,
159
  "optimizer": {
160
  "type": "adamw",
161
- "lr": 1e-05,
162
- "weight_decay": 0.0001,
163
- "grad_clip_norm": 10.0,
164
  "betas": [
165
  0.9,
166
- 0.999
167
  ],
168
  "eps": 1e-08
169
  },
170
- "scheduler": null,
 
 
 
 
 
 
171
  "eval": {
172
  "n_episodes": 50,
173
  "batch_size": 50,
@@ -175,11 +226,19 @@
175
  },
176
  "wandb": {
177
  "enable": true,
178
- "disable_artifact": false,
179
- "project": "lerobot",
180
  "entity": null,
181
- "notes": null,
182
- "run_id": "5spc6953",
183
  "mode": null
 
 
 
 
 
 
 
 
184
  }
185
  }
 
1
  {
2
  "dataset": {
3
+ "repo_id": "thomas0829/fold_the_towel",
4
+ "repo_ids": null,
5
  "root": null,
6
  "episodes": null,
7
  "image_transforms": {
8
+ "enable": true,
9
  "max_num_transforms": 3,
10
  "random_order": false,
11
  "tfs": {
 
58
  1.5
59
  ]
60
  }
61
+ },
62
+ "affine": {
63
+ "weight": 1.0,
64
+ "type": "RandomAffine",
65
+ "kwargs": {
66
+ "degrees": [
67
+ -5.0,
68
+ 5.0
69
+ ],
70
+ "translate": [
71
+ 0.05,
72
+ 0.05
73
+ ]
74
+ }
75
  }
76
  }
77
  },
78
  "revision": null,
79
  "use_imagenet_stats": true,
80
  "video_backend": "torchcodec",
81
+ "force_cache_sync": false,
82
+ "use_annotated_tasks": false
83
  },
84
+ "num_datasets": 100,
85
  "env": null,
86
  "policy": {
87
+ "type": "pi05",
88
  "n_obs_steps": 1,
89
+ "normalization_mapping": {
90
+ "VISUAL": "IDENTITY",
91
+ "STATE": "QUANTILES",
92
+ "ACTION": "QUANTILES"
93
+ },
94
  "input_features": {
95
  "observation.state": {
96
  "type": "STATE",
97
  "shape": [
98
+ 14
99
  ]
100
  },
101
  "observation.images.left": {
102
  "type": "VISUAL",
103
  "shape": [
104
  3,
105
+ 224,
106
+ 224
107
  ]
108
  },
109
+ "observation.images.top": {
110
  "type": "VISUAL",
111
  "shape": [
112
  3,
113
+ 224,
114
+ 224
115
  ]
116
  },
117
+ "observation.images.right": {
118
  "type": "VISUAL",
119
  "shape": [
120
  3,
121
+ 224,
122
+ 224
123
  ]
124
  }
125
  },
 
127
  "action": {
128
  "type": "ACTION",
129
  "shape": [
130
+ 14
131
  ]
132
  }
133
  },
134
  "device": "cuda",
135
  "use_amp": false,
136
+ "compiled": false,
137
  "push_to_hub": true,
138
+ "repo_id": "thomas0829/test",
139
+ "private": false,
140
  "tags": null,
141
  "license": null,
142
+ "pretrained_path": "lerobot/pi05_base",
143
+ "paligemma_variant": "gemma_2b",
144
+ "action_expert_variant": "gemma_300m",
145
+ "dtype": "bfloat16",
146
+ "chunk_size": 50,
147
+ "n_action_steps": 50,
148
+ "max_state_dim": 32,
149
+ "max_action_dim": 32,
150
+ "num_inference_steps": 10,
151
+ "time_sampling_beta_alpha": 1.5,
152
+ "time_sampling_beta_beta": 1.0,
153
+ "time_sampling_scale": 0.999,
154
+ "time_sampling_offset": 0.001,
155
+ "min_period": 0.004,
156
+ "max_period": 4.0,
157
+ "rtc_config": null,
158
+ "image_resolution": [
159
+ 224,
160
+ 224
161
+ ],
162
+ "empty_cameras": 0,
163
+ "tokenizer_max_length": 200,
164
+ "gradient_checkpointing": true,
165
+ "compile_model": false,
166
+ "compile_mode": "max-autotune",
167
+ "attention_implementation": "eager",
168
+ "use_lora": false,
169
+ "lora_rank": 16,
170
+ "lora_alpha": 32.0,
171
+ "lora_dropout": 0.1,
172
+ "lora_target_modules": null,
173
+ "optimizer_lr": 2.5e-05,
174
+ "optimizer_betas": [
175
+ 0.9,
176
+ 0.95
177
+ ],
178
+ "optimizer_eps": 1e-08,
179
+ "optimizer_weight_decay": 0.01,
180
+ "optimizer_grad_clip_norm": 1.0,
181
+ "scheduler_warmup_steps": 1000,
182
+ "scheduler_decay_steps": 30000,
183
+ "scheduler_decay_lr": 1e-05
184
  },
185
+ "compile": false,
186
+ "strict": true,
187
+ "loss_threshold": 3.0,
188
+ "output_dir": "outputs/train/2026-02-25/15-59-03_pi05_training",
189
+ "job_name": "pi05_training",
190
  "resume": false,
191
+ "resume_scheduler": true,
192
+ "seed": 3407,
193
  "num_workers": 4,
194
+ "batch_size": 1,
195
+ "gradient_accumulation_steps": 2,
196
+ "steps": 10,
197
+ "eval_freq": 20000,
198
+ "log_freq": 10,
199
  "save_checkpoint": true,
200
+ "push_to_hub": false,
201
+ "repo_id": null,
202
+ "save_freq": 5000,
203
  "use_policy_training_preset": true,
204
  "optimizer": {
205
  "type": "adamw",
206
+ "lr": 2.5e-05,
207
+ "weight_decay": 0.01,
208
+ "grad_clip_norm": 1.0,
209
  "betas": [
210
  0.9,
211
+ 0.95
212
  ],
213
  "eps": 1e-08
214
  },
215
+ "scheduler": {
216
+ "type": "cosine_decay_with_warmup",
217
+ "num_warmup_steps": 1000,
218
+ "num_decay_steps": 30000,
219
+ "peak_lr": 2.5e-05,
220
+ "decay_lr": 1e-05
221
+ },
222
  "eval": {
223
  "n_episodes": 50,
224
  "batch_size": 50,
 
226
  },
227
  "wandb": {
228
  "enable": true,
229
+ "disable_artifact": true,
230
+ "project": "yam-pi05-finetune",
231
  "entity": null,
232
+ "notes": "Full fine-tuning of pi05 on put_the_dolls_on_the_cloth dataset",
233
+ "run_id": null,
234
  "mode": null
235
+ },
236
+ "test_dataloader": false,
237
+ "num_epochs": 1,
238
+ "ddp_timeout_s": 6000,
239
+ "rename_map": {
240
+ "observation.images.front_camera": "observation.images.top",
241
+ "observation.images.left_camera": "observation.images.left",
242
+ "observation.images.right_camera": "observation.images.right"
243
  }
244
  }