Robotics
LeRobot
Safetensors
act
jjjeonghi committed on
Commit
6173095
·
verified ·
1 Parent(s): 221a110

Upload policy weights, train config and readme

Browse files
Files changed (4) hide show
  1. README.md +5 -5
  2. config.json +23 -54
  3. model.safetensors +2 -2
  4. train_config.json +33 -70
README.md CHANGED
@@ -2,20 +2,20 @@
2
  datasets: data/banana_plate
3
  library_name: lerobot
4
  license: apache-2.0
5
- model_name: pi0_enhanced
6
  pipeline_tag: robotics
7
  tags:
8
- - robotics
9
  - lerobot
10
- - pi0_enhanced
11
  ---
12
 
13
- # Model Card for pi0_enhanced
14
 
15
  <!-- Provide a quick summary of what the model is/does. -->
16
 
17
 
18
- _Model type not recognized please update this template._
19
 
20
 
21
  This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
 
2
  datasets: data/banana_plate
3
  library_name: lerobot
4
  license: apache-2.0
5
+ model_name: act
6
  pipeline_tag: robotics
7
  tags:
8
+ - act
9
  - lerobot
10
+ - robotics
11
  ---
12
 
13
+ # Model Card for act
14
 
15
  <!-- Provide a quick summary of what the model is/does. -->
16
 
17
 
18
+ [Action Chunking with Transformers (ACT)](https://huggingface.co/papers/2304.13705) is an imitation-learning method that predicts short action chunks instead of single steps. It learns from teleoperated data and often achieves high success rates.
19
 
20
 
21
  This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "type": "pi0_enhanced",
3
  "n_obs_steps": 1,
4
  "input_features": {
5
  "observation.state": {
@@ -49,61 +49,30 @@
49
  "tags": null,
50
  "license": null,
51
  "pretrained_path": null,
52
- "paligemma_variant": "gemma_2b",
53
- "action_expert_variant": "gemma_300m",
54
- "dtype": "bfloat16",
55
- "chunk_size": 50,
56
- "n_action_steps": 50,
57
- "max_state_dim": 32,
58
- "max_action_dim": 32,
59
- "num_inference_steps": 10,
60
- "time_sampling_beta_alpha": 1.5,
61
- "time_sampling_beta_beta": 1.0,
62
- "time_sampling_scale": 0.999,
63
- "time_sampling_offset": 0.001,
64
- "min_period": 0.004,
65
- "max_period": 4.0,
66
- "image_resolution": [
67
- 224,
68
- 224
69
- ],
70
- "empty_cameras": 0,
71
- "freeze_vision_encoder": true,
72
- "msap_enabled": true,
73
- "msap_scales": [
74
- 1,
75
- 2,
76
- 4
77
- ],
78
- "msap_num_heads": 8,
79
- "msap_reduction_factor": 4,
80
- "scroi_enabled": true,
81
- "scroi_num_roi_queries": 8,
82
- "scroi_num_heads": 8,
83
- "had_enabled": true,
84
- "had_coarse_factor": 5,
85
- "had_num_transformer_layers": 2,
86
- "had_num_heads": 4,
87
- "had_use_auxiliary_loss": true,
88
- "had_auxiliary_loss_weight": 0.1,
89
  "normalization_mapping": {
90
- "VISUAL": "IDENTITY",
91
  "STATE": "MEAN_STD",
92
  "ACTION": "MEAN_STD"
93
  },
94
- "gradient_checkpointing": true,
95
- "compile_model": false,
96
- "compile_mode": "max-autotune",
97
- "optimizer_lr": 2.5e-05,
98
- "optimizer_betas": [
99
- 0.9,
100
- 0.95
101
- ],
102
- "optimizer_eps": 1e-08,
103
- "optimizer_weight_decay": 0.01,
104
- "optimizer_grad_clip_norm": 1.0,
105
- "scheduler_warmup_steps": 1000,
106
- "scheduler_decay_steps": 30000,
107
- "scheduler_decay_lr": 2.5e-06,
108
- "tokenizer_max_length": 48
 
 
 
 
109
  }
 
1
  {
2
+ "type": "act",
3
  "n_obs_steps": 1,
4
  "input_features": {
5
  "observation.state": {
 
49
  "tags": null,
50
  "license": null,
51
  "pretrained_path": null,
52
+ "chunk_size": 100,
53
+ "n_action_steps": 100,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  "normalization_mapping": {
55
+ "VISUAL": "MEAN_STD",
56
  "STATE": "MEAN_STD",
57
  "ACTION": "MEAN_STD"
58
  },
59
+ "vision_backbone": "resnet18",
60
+ "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
61
+ "replace_final_stride_with_dilation": false,
62
+ "pre_norm": false,
63
+ "dim_model": 512,
64
+ "n_heads": 8,
65
+ "dim_feedforward": 3200,
66
+ "feedforward_activation": "relu",
67
+ "n_encoder_layers": 4,
68
+ "n_decoder_layers": 1,
69
+ "use_vae": true,
70
+ "latent_dim": 32,
71
+ "n_vae_encoder_layers": 4,
72
+ "temporal_ensemble_coeff": null,
73
+ "dropout": 0.1,
74
+ "kl_weight": 10.0,
75
+ "optimizer_lr": 1e-05,
76
+ "optimizer_weight_decay": 0.0001,
77
+ "optimizer_lr_backbone": 1e-05
78
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf1758948fc91d6dfd1da0b25d252f1942421842bd20d4a6a1bceaa88f24e7ef
3
- size 7220225208
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29cc908fd5d4d0a458a5fdd89888e0b61e035532db52691bdc599896762db3a2
3
+ size 206748912
train_config.json CHANGED
@@ -81,7 +81,7 @@
81
  },
82
  "env": null,
83
  "policy": {
84
- "type": "pi0_enhanced",
85
  "n_obs_steps": 1,
86
  "input_features": {
87
  "observation.state": {
@@ -131,71 +131,40 @@
131
  "tags": null,
132
  "license": null,
133
  "pretrained_path": null,
134
- "paligemma_variant": "gemma_2b",
135
- "action_expert_variant": "gemma_300m",
136
- "dtype": "bfloat16",
137
- "chunk_size": 50,
138
- "n_action_steps": 50,
139
- "max_state_dim": 32,
140
- "max_action_dim": 32,
141
- "num_inference_steps": 10,
142
- "time_sampling_beta_alpha": 1.5,
143
- "time_sampling_beta_beta": 1.0,
144
- "time_sampling_scale": 0.999,
145
- "time_sampling_offset": 0.001,
146
- "min_period": 0.004,
147
- "max_period": 4.0,
148
- "image_resolution": [
149
- 224,
150
- 224
151
- ],
152
- "empty_cameras": 0,
153
- "freeze_vision_encoder": true,
154
- "msap_enabled": true,
155
- "msap_scales": [
156
- 1,
157
- 2,
158
- 4
159
- ],
160
- "msap_num_heads": 8,
161
- "msap_reduction_factor": 4,
162
- "scroi_enabled": true,
163
- "scroi_num_roi_queries": 8,
164
- "scroi_num_heads": 8,
165
- "had_enabled": true,
166
- "had_coarse_factor": 5,
167
- "had_num_transformer_layers": 2,
168
- "had_num_heads": 4,
169
- "had_use_auxiliary_loss": true,
170
- "had_auxiliary_loss_weight": 0.1,
171
  "normalization_mapping": {
172
- "VISUAL": "IDENTITY",
173
  "STATE": "MEAN_STD",
174
  "ACTION": "MEAN_STD"
175
  },
176
- "gradient_checkpointing": true,
177
- "compile_model": false,
178
- "compile_mode": "max-autotune",
179
- "optimizer_lr": 2.5e-05,
180
- "optimizer_betas": [
181
- 0.9,
182
- 0.95
183
- ],
184
- "optimizer_eps": 1e-08,
185
- "optimizer_weight_decay": 0.01,
186
- "optimizer_grad_clip_norm": 1.0,
187
- "scheduler_warmup_steps": 1000,
188
- "scheduler_decay_steps": 30000,
189
- "scheduler_decay_lr": 2.5e-06,
190
- "tokenizer_max_length": 48
 
 
 
 
191
  },
192
- "output_dir": "outputs/test_pi0_enhanced",
193
- "job_name": "pi0_enhanced",
194
  "resume": false,
195
  "seed": 1000,
196
- "num_workers": 0,
197
- "batch_size": 1,
198
- "steps": 10,
199
  "eval_freq": 20000,
200
  "log_freq": 200,
201
  "save_checkpoint": true,
@@ -203,22 +172,16 @@
203
  "use_policy_training_preset": true,
204
  "optimizer": {
205
  "type": "adamw",
206
- "lr": 2.5e-05,
207
- "weight_decay": 0.01,
208
- "grad_clip_norm": 1.0,
209
  "betas": [
210
  0.9,
211
- 0.95
212
  ],
213
  "eps": 1e-08
214
  },
215
- "scheduler": {
216
- "type": "cosine_decay_with_warmup",
217
- "num_warmup_steps": 1000,
218
- "num_decay_steps": 30000,
219
- "peak_lr": 2.5e-05,
220
- "decay_lr": 2.5e-06
221
- },
222
  "eval": {
223
  "n_episodes": 50,
224
  "batch_size": 50,
 
81
  },
82
  "env": null,
83
  "policy": {
84
+ "type": "act",
85
  "n_obs_steps": 1,
86
  "input_features": {
87
  "observation.state": {
 
131
  "tags": null,
132
  "license": null,
133
  "pretrained_path": null,
134
+ "chunk_size": 100,
135
+ "n_action_steps": 100,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  "normalization_mapping": {
137
+ "VISUAL": "MEAN_STD",
138
  "STATE": "MEAN_STD",
139
  "ACTION": "MEAN_STD"
140
  },
141
+ "vision_backbone": "resnet18",
142
+ "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
143
+ "replace_final_stride_with_dilation": false,
144
+ "pre_norm": false,
145
+ "dim_model": 512,
146
+ "n_heads": 8,
147
+ "dim_feedforward": 3200,
148
+ "feedforward_activation": "relu",
149
+ "n_encoder_layers": 4,
150
+ "n_decoder_layers": 1,
151
+ "use_vae": true,
152
+ "latent_dim": 32,
153
+ "n_vae_encoder_layers": 4,
154
+ "temporal_ensemble_coeff": null,
155
+ "dropout": 0.1,
156
+ "kl_weight": 10.0,
157
+ "optimizer_lr": 1e-05,
158
+ "optimizer_weight_decay": 0.0001,
159
+ "optimizer_lr_backbone": 1e-05
160
  },
161
+ "output_dir": "/media/choi/HDD/outputs/train/ACT_enhanced_banana",
162
+ "job_name": "pi0_enhanced_banana",
163
  "resume": false,
164
  "seed": 1000,
165
+ "num_workers": 4,
166
+ "batch_size": 8,
167
+ "steps": 100000,
168
  "eval_freq": 20000,
169
  "log_freq": 200,
170
  "save_checkpoint": true,
 
172
  "use_policy_training_preset": true,
173
  "optimizer": {
174
  "type": "adamw",
175
+ "lr": 1e-05,
176
+ "weight_decay": 0.0001,
177
+ "grad_clip_norm": 10.0,
178
  "betas": [
179
  0.9,
180
+ 0.999
181
  ],
182
  "eps": 1e-08
183
  },
184
+ "scheduler": null,
 
 
 
 
 
 
185
  "eval": {
186
  "n_episodes": 50,
187
  "batch_size": 50,