Robotics
LeRobot
Safetensors
act
autel123 committed on
Commit
6f77b1e
·
verified ·
1 Parent(s): 4c2da24

Upload policy weights, train config and readme

Browse files
Files changed (4) hide show
  1. README.md +6 -7
  2. config.json +27 -54
  3. model.safetensors +2 -2
  4. train_config.json +72 -70
README.md CHANGED
@@ -1,22 +1,21 @@
1
  ---
2
- base_model: lerobot/smolvla_base
3
- datasets: lerobot/svla_so100_stacking
4
  library_name: lerobot
5
  license: apache-2.0
6
- model_name: smolvla
7
  pipeline_tag: robotics
8
  tags:
9
- - robotics
10
- - smolvla
11
  - lerobot
 
12
  ---
13
 
14
- # Model Card for smolvla
15
 
16
  <!-- Provide a quick summary of what the model is/does. -->
17
 
18
 
19
- [SmolVLA](https://huggingface.co/papers/2506.01844) is a compact, efficient vision-language-action model that achieves competitive performance at reduced computational costs and can be deployed on consumer-grade hardware.
20
 
21
 
22
  This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
 
1
  ---
2
+ datasets: lerobot/aloha_sim_insertion_human
 
3
  library_name: lerobot
4
  license: apache-2.0
5
+ model_name: act
6
  pipeline_tag: robotics
7
  tags:
8
+ - act
 
9
  - lerobot
10
+ - robotics
11
  ---
12
 
13
+ # Model Card for act
14
 
15
  <!-- Provide a quick summary of what the model is/does. -->
16
 
17
 
18
+ [Action Chunking with Transformers (ACT)](https://huggingface.co/papers/2304.13705) is an imitation-learning method that predicts short action chunks instead of single steps. It learns from teleoperated data and often achieves high success rates.
19
 
20
 
21
  This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
config.json CHANGED
@@ -1,18 +1,12 @@
1
  {
2
- "type": "smolvla",
3
  "n_obs_steps": 1,
4
  "normalization_mapping": {
5
- "VISUAL": "IDENTITY",
6
  "STATE": "MEAN_STD",
7
  "ACTION": "MEAN_STD"
8
  },
9
  "input_features": {
10
- "observation.state": {
11
- "type": "STATE",
12
- "shape": [
13
- 6
14
- ]
15
- },
16
  "observation.images.top": {
17
  "type": "VISUAL",
18
  "shape": [
@@ -21,12 +15,10 @@
21
  640
22
  ]
23
  },
24
- "observation.images.wrist": {
25
- "type": "VISUAL",
26
  "shape": [
27
- 3,
28
- 480,
29
- 640
30
  ]
31
  }
32
  },
@@ -34,7 +26,7 @@
34
  "action": {
35
  "type": "ACTION",
36
  "shape": [
37
- 6
38
  ]
39
  }
40
  },
@@ -45,44 +37,25 @@
45
  "private": null,
46
  "tags": null,
47
  "license": null,
48
- "chunk_size": 50,
49
- "n_action_steps": 50,
50
- "max_state_dim": 32,
51
- "max_action_dim": 32,
52
- "resize_imgs_with_padding": [
53
- 512,
54
- 512
55
- ],
56
- "empty_cameras": 0,
57
- "adapt_to_pi_aloha": false,
58
- "use_delta_joint_actions_aloha": false,
59
- "tokenizer_max_length": 48,
60
- "num_steps": 10,
61
- "use_cache": true,
62
- "freeze_vision_encoder": true,
63
- "train_expert_only": true,
64
- "train_state_proj": true,
65
- "optimizer_lr": 0.0001,
66
- "optimizer_betas": [
67
- 0.9,
68
- 0.95
69
- ],
70
- "optimizer_eps": 1e-08,
71
- "optimizer_weight_decay": 1e-10,
72
- "optimizer_grad_clip_norm": 10.0,
73
- "scheduler_warmup_steps": 1000,
74
- "scheduler_decay_steps": 30000,
75
- "scheduler_decay_lr": 2.5e-06,
76
- "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
77
- "load_vlm_weights": true,
78
- "add_image_special_tokens": false,
79
- "attention_mode": "cross_attn",
80
- "prefix_length": 0,
81
- "pad_language_to": "max_length",
82
- "num_expert_layers": 0,
83
- "num_vlm_layers": 16,
84
- "self_attn_every_n_layers": 2,
85
- "expert_width_multiplier": 0.75,
86
- "min_period": 0.004,
87
- "max_period": 4.0
88
  }
 
1
  {
2
+ "type": "act",
3
  "n_obs_steps": 1,
4
  "normalization_mapping": {
5
+ "VISUAL": "MEAN_STD",
6
  "STATE": "MEAN_STD",
7
  "ACTION": "MEAN_STD"
8
  },
9
  "input_features": {
 
 
 
 
 
 
10
  "observation.images.top": {
11
  "type": "VISUAL",
12
  "shape": [
 
15
  640
16
  ]
17
  },
18
+ "observation.state": {
19
+ "type": "STATE",
20
  "shape": [
21
+ 14
 
 
22
  ]
23
  }
24
  },
 
26
  "action": {
27
  "type": "ACTION",
28
  "shape": [
29
+ 14
30
  ]
31
  }
32
  },
 
37
  "private": null,
38
  "tags": null,
39
  "license": null,
40
+ "chunk_size": 100,
41
+ "n_action_steps": 100,
42
+ "vision_backbone": "resnet18",
43
+ "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
44
+ "replace_final_stride_with_dilation": false,
45
+ "pre_norm": false,
46
+ "dim_model": 512,
47
+ "n_heads": 8,
48
+ "dim_feedforward": 3200,
49
+ "feedforward_activation": "relu",
50
+ "n_encoder_layers": 4,
51
+ "n_decoder_layers": 1,
52
+ "use_vae": true,
53
+ "latent_dim": 32,
54
+ "n_vae_encoder_layers": 4,
55
+ "temporal_ensemble_coeff": null,
56
+ "dropout": 0.1,
57
+ "kl_weight": 10.0,
58
+ "optimizer_lr": 1e-05,
59
+ "optimizer_weight_decay": 0.0001,
60
+ "optimizer_lr_backbone": 1e-05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:56e643a86156b47c809ddf091c3552dacc0b049a9c427d3f14af79ea70974d0d
3
- size 906713296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:928a7d6b23400e62c28416e66ae2b8678ddbd47b335a6d9e18ddb02b4ca7dbb4
3
+ size 206766560
train_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "dataset": {
3
- "repo_id": "lerobot/svla_so100_stacking",
4
  "root": null,
5
  "episodes": null,
6
  "image_transforms": {
@@ -64,22 +64,51 @@
64
  "use_imagenet_stats": true,
65
  "video_backend": "pyav"
66
  },
67
- "env": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  "policy": {
69
- "type": "smolvla",
70
  "n_obs_steps": 1,
71
  "normalization_mapping": {
72
- "VISUAL": "IDENTITY",
73
  "STATE": "MEAN_STD",
74
  "ACTION": "MEAN_STD"
75
  },
76
  "input_features": {
77
- "observation.state": {
78
- "type": "STATE",
79
- "shape": [
80
- 6
81
- ]
82
- },
83
  "observation.images.top": {
84
  "type": "VISUAL",
85
  "shape": [
@@ -88,12 +117,10 @@
88
  640
89
  ]
90
  },
91
- "observation.images.wrist": {
92
- "type": "VISUAL",
93
  "shape": [
94
- 3,
95
- 480,
96
- 640
97
  ]
98
  }
99
  },
@@ -101,7 +128,7 @@
101
  "action": {
102
  "type": "ACTION",
103
  "shape": [
104
- 6
105
  ]
106
  }
107
  },
@@ -112,54 +139,35 @@
112
  "private": null,
113
  "tags": null,
114
  "license": null,
115
- "chunk_size": 50,
116
- "n_action_steps": 50,
117
- "max_state_dim": 32,
118
- "max_action_dim": 32,
119
- "resize_imgs_with_padding": [
120
- 512,
121
- 512
122
- ],
123
- "empty_cameras": 0,
124
- "adapt_to_pi_aloha": false,
125
- "use_delta_joint_actions_aloha": false,
126
- "tokenizer_max_length": 48,
127
- "num_steps": 10,
128
- "use_cache": true,
129
- "freeze_vision_encoder": true,
130
- "train_expert_only": true,
131
- "train_state_proj": true,
132
- "optimizer_lr": 0.0001,
133
- "optimizer_betas": [
134
- 0.9,
135
- 0.95
136
- ],
137
- "optimizer_eps": 1e-08,
138
- "optimizer_weight_decay": 1e-10,
139
- "optimizer_grad_clip_norm": 10.0,
140
- "scheduler_warmup_steps": 1000,
141
- "scheduler_decay_steps": 30000,
142
- "scheduler_decay_lr": 2.5e-06,
143
- "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
144
- "load_vlm_weights": true,
145
- "add_image_special_tokens": false,
146
- "attention_mode": "cross_attn",
147
- "prefix_length": 0,
148
- "pad_language_to": "max_length",
149
- "num_expert_layers": 0,
150
- "num_vlm_layers": 16,
151
- "self_attn_every_n_layers": 2,
152
- "expert_width_multiplier": 0.75,
153
- "min_period": 0.004,
154
- "max_period": 4.0
155
  },
156
- "output_dir": "outputs/train/2025-07-08/17-12-30_smolvla",
157
- "job_name": "smolvla",
158
  "resume": false,
159
  "seed": 1000,
160
  "num_workers": 4,
161
- "batch_size": 64,
162
- "steps": 200000,
163
  "eval_freq": 20000,
164
  "log_freq": 200,
165
  "save_checkpoint": true,
@@ -167,22 +175,16 @@
167
  "use_policy_training_preset": true,
168
  "optimizer": {
169
  "type": "adamw",
170
- "lr": 0.0001,
171
- "weight_decay": 1e-10,
172
  "grad_clip_norm": 10.0,
173
  "betas": [
174
  0.9,
175
- 0.95
176
  ],
177
  "eps": 1e-08
178
  },
179
- "scheduler": {
180
- "type": "cosine_decay_with_warmup",
181
- "num_warmup_steps": 1000,
182
- "num_decay_steps": 30000,
183
- "peak_lr": 0.0001,
184
- "decay_lr": 2.5e-06
185
- },
186
  "eval": {
187
  "n_episodes": 50,
188
  "batch_size": 50,
 
1
  {
2
  "dataset": {
3
+ "repo_id": "lerobot/aloha_sim_insertion_human",
4
  "root": null,
5
  "episodes": null,
6
  "image_transforms": {
 
64
  "use_imagenet_stats": true,
65
  "video_backend": "pyav"
66
  },
67
+ "env": {
68
+ "type": "aloha",
69
+ "task": "AlohaInsertion-v0",
70
+ "fps": 50,
71
+ "features": {
72
+ "action": {
73
+ "type": "ACTION",
74
+ "shape": [
75
+ 14
76
+ ]
77
+ },
78
+ "agent_pos": {
79
+ "type": "STATE",
80
+ "shape": [
81
+ 14
82
+ ]
83
+ },
84
+ "pixels/top": {
85
+ "type": "VISUAL",
86
+ "shape": [
87
+ 480,
88
+ 640,
89
+ 3
90
+ ]
91
+ }
92
+ },
93
+ "features_map": {
94
+ "action": "action",
95
+ "agent_pos": "observation.state",
96
+ "top": "observation.image.top",
97
+ "pixels/top": "observation.images.top"
98
+ },
99
+ "episode_length": 400,
100
+ "obs_type": "pixels_agent_pos",
101
+ "render_mode": "rgb_array"
102
+ },
103
  "policy": {
104
+ "type": "act",
105
  "n_obs_steps": 1,
106
  "normalization_mapping": {
107
+ "VISUAL": "MEAN_STD",
108
  "STATE": "MEAN_STD",
109
  "ACTION": "MEAN_STD"
110
  },
111
  "input_features": {
 
 
 
 
 
 
112
  "observation.images.top": {
113
  "type": "VISUAL",
114
  "shape": [
 
117
  640
118
  ]
119
  },
120
+ "observation.state": {
121
+ "type": "STATE",
122
  "shape": [
123
+ 14
 
 
124
  ]
125
  }
126
  },
 
128
  "action": {
129
  "type": "ACTION",
130
  "shape": [
131
+ 14
132
  ]
133
  }
134
  },
 
139
  "private": null,
140
  "tags": null,
141
  "license": null,
142
+ "chunk_size": 100,
143
+ "n_action_steps": 100,
144
+ "vision_backbone": "resnet18",
145
+ "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
146
+ "replace_final_stride_with_dilation": false,
147
+ "pre_norm": false,
148
+ "dim_model": 512,
149
+ "n_heads": 8,
150
+ "dim_feedforward": 3200,
151
+ "feedforward_activation": "relu",
152
+ "n_encoder_layers": 4,
153
+ "n_decoder_layers": 1,
154
+ "use_vae": true,
155
+ "latent_dim": 32,
156
+ "n_vae_encoder_layers": 4,
157
+ "temporal_ensemble_coeff": null,
158
+ "dropout": 0.1,
159
+ "kl_weight": 10.0,
160
+ "optimizer_lr": 1e-05,
161
+ "optimizer_weight_decay": 0.0001,
162
+ "optimizer_lr_backbone": 1e-05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  },
164
+ "output_dir": "outputs/train/act_aloha_insertion",
165
+ "job_name": "aloha_act",
166
  "resume": false,
167
  "seed": 1000,
168
  "num_workers": 4,
169
+ "batch_size": 8,
170
+ "steps": 100000,
171
  "eval_freq": 20000,
172
  "log_freq": 200,
173
  "save_checkpoint": true,
 
175
  "use_policy_training_preset": true,
176
  "optimizer": {
177
  "type": "adamw",
178
+ "lr": 1e-05,
179
+ "weight_decay": 0.0001,
180
  "grad_clip_norm": 10.0,
181
  "betas": [
182
  0.9,
183
+ 0.999
184
  ],
185
  "eps": 1e-08
186
  },
187
+ "scheduler": null,
 
 
 
 
 
 
188
  "eval": {
189
  "n_episodes": 50,
190
  "batch_size": 50,