Robotics
LeRobot
Safetensors
act
msr915 committed on
Commit
bd1ecec
·
verified ·
1 Parent(s): 4edd786

Upload policy weights, train config and readme

Browse files
Files changed (4) hide show
  1. README.md +10 -11
  2. config.json +35 -47
  3. model.safetensors +2 -2
  4. train_config.json +67 -68
README.md CHANGED
@@ -1,22 +1,21 @@
1
  ---
2
- base_model: lerobot/smolvla_base
3
- datasets: aloha/push_T_top_to_bottom_mixture_5_100percent
4
  library_name: lerobot
5
  license: apache-2.0
6
- model_name: smolvla
7
  pipeline_tag: robotics
8
  tags:
9
- - lerobot
10
  - robotics
11
- - smolvla
12
  ---
13
 
14
- # Model Card for smolvla
15
 
16
  <!-- Provide a quick summary of what the model is/does. -->
17
 
18
 
19
- [SmolVLA](https://huggingface.co/papers/2506.01844) is a compact, efficient vision-language-action model that achieves competitive performance at reduced computational costs and can be deployed on consumer-grade hardware.
20
 
21
 
22
  This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
@@ -32,7 +31,7 @@ Below is the short version on how to train and run inference/eval:
32
  ### Train from scratch
33
 
34
  ```bash
35
- python -m lerobot.scripts.train \
36
  --dataset.repo_id=${HF_USER}/<dataset> \
37
  --policy.type=act \
38
  --output_dir=outputs/train/<desired_policy_repo_id> \
@@ -42,12 +41,12 @@ python -m lerobot.scripts.train \
42
  --wandb.enable=true
43
  ```
44
 
45
- *Writes checkpoints to `outputs/train/<desired_policy_repo_id>/checkpoints/`.*
46
 
47
  ### Evaluate the policy/run inference
48
 
49
  ```bash
50
- python -m lerobot.record \
51
  --robot.type=so100_follower \
52
  --dataset.repo_id=<hf_user>/eval_<dataset> \
53
  --policy.path=<hf_user>/<desired_policy_repo_id> \
@@ -60,4 +59,4 @@ Prefix the dataset repo with **eval\_** and supply `--policy.path` pointing to a
60
 
61
  ## Model Details
62
 
63
- * **License:** apache-2.0
 
1
  ---
2
+ datasets: aloha/simulation_push_T_bottom_right
 
3
  library_name: lerobot
4
  license: apache-2.0
5
+ model_name: act
6
  pipeline_tag: robotics
7
  tags:
8
+ - act
9
  - robotics
10
+ - lerobot
11
  ---
12
 
13
+ # Model Card for act
14
 
15
  <!-- Provide a quick summary of what the model is/does. -->
16
 
17
 
18
+ [Action Chunking with Transformers (ACT)](https://huggingface.co/papers/2304.13705) is an imitation-learning method that predicts short action chunks instead of single steps. It learns from teleoperated data and often achieves high success rates.
19
 
20
 
21
  This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
 
31
  ### Train from scratch
32
 
33
  ```bash
34
+ lerobot-train \
35
  --dataset.repo_id=${HF_USER}/<dataset> \
36
  --policy.type=act \
37
  --output_dir=outputs/train/<desired_policy_repo_id> \
 
41
  --wandb.enable=true
42
  ```
43
 
44
+ _Writes checkpoints to `outputs/train/<desired_policy_repo_id>/checkpoints/`._
45
 
46
  ### Evaluate the policy/run inference
47
 
48
  ```bash
49
+ lerobot-record \
50
  --robot.type=so100_follower \
51
  --dataset.repo_id=<hf_user>/eval_<dataset> \
52
  --policy.path=<hf_user>/<desired_policy_repo_id> \
 
59
 
60
  ## Model Details
61
 
62
+ - **License:** apache-2.0
config.json CHANGED
@@ -1,12 +1,13 @@
1
  {
2
- "type": "smolvla",
3
  "n_obs_steps": 1,
4
- "normalization_mapping": {
5
- "VISUAL": "IDENTITY",
6
- "STATE": "MEAN_STD",
7
- "ACTION": "MEAN_STD"
8
- },
9
  "input_features": {
 
 
 
 
 
 
10
  "observation.images.top": {
11
  "type": "VISUAL",
12
  "shape": [
@@ -25,50 +26,37 @@
25
  }
26
  },
27
  "device": "cuda",
28
- "use_amp": true,
29
  "push_to_hub": true,
30
  "repo_id": "smolvla",
31
  "private": null,
32
  "tags": null,
33
  "license": null,
34
- "chunk_size": 50,
35
- "n_action_steps": 50,
36
- "max_state_dim": 32,
37
- "max_action_dim": 32,
38
- "resize_imgs_with_padding": [
39
- 512,
40
- 512
41
- ],
42
- "empty_cameras": 0,
43
- "adapt_to_pi_aloha": false,
44
- "use_delta_joint_actions_aloha": false,
45
- "tokenizer_max_length": 48,
46
- "num_steps": 10,
47
- "use_cache": true,
48
- "freeze_vision_encoder": true,
49
- "train_expert_only": true,
50
- "train_state_proj": true,
51
- "optimizer_lr": 0.0001,
52
- "optimizer_betas": [
53
- 0.9,
54
- 0.95
55
- ],
56
- "optimizer_eps": 1e-08,
57
- "optimizer_weight_decay": 1e-10,
58
- "optimizer_grad_clip_norm": 10,
59
- "scheduler_warmup_steps": 1000,
60
- "scheduler_decay_steps": 30000,
61
- "scheduler_decay_lr": 2.5e-06,
62
- "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
63
- "load_vlm_weights": false,
64
- "add_image_special_tokens": false,
65
- "attention_mode": "cross_attn",
66
- "prefix_length": -1,
67
- "pad_language_to": "longest",
68
- "num_expert_layers": -1,
69
- "num_vlm_layers": 16,
70
- "self_attn_every_n_layers": 2,
71
- "expert_width_multiplier": 0.75,
72
- "min_period": 0.004,
73
- "max_period": 4.0
74
  }
 
1
  {
2
+ "type": "act",
3
  "n_obs_steps": 1,
 
 
 
 
 
4
  "input_features": {
5
+ "observation.state": {
6
+ "type": "STATE",
7
+ "shape": [
8
+ 2
9
+ ]
10
+ },
11
  "observation.images.top": {
12
  "type": "VISUAL",
13
  "shape": [
 
26
  }
27
  },
28
  "device": "cuda",
29
+ "use_amp": false,
30
  "push_to_hub": true,
31
  "repo_id": "smolvla",
32
  "private": null,
33
  "tags": null,
34
  "license": null,
35
+ "pretrained_path": null,
36
+ "chunk_size": 100,
37
+ "n_action_steps": 100,
38
+ "normalization_mapping": {
39
+ "VISUAL": "MEAN_STD",
40
+ "STATE": "MEAN_STD",
41
+ "ACTION": "MEAN_STD"
42
+ },
43
+ "vision_backbone": "resnet18",
44
+ "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
45
+ "replace_final_stride_with_dilation": false,
46
+ "pre_norm": false,
47
+ "dim_model": 512,
48
+ "n_heads": 8,
49
+ "dim_feedforward": 3200,
50
+ "feedforward_activation": "relu",
51
+ "n_encoder_layers": 4,
52
+ "n_decoder_layers": 1,
53
+ "use_vae": true,
54
+ "latent_dim": 32,
55
+ "n_vae_encoder_layers": 4,
56
+ "temporal_ensemble_coeff": null,
57
+ "dropout": 0.1,
58
+ "kl_weight": 10.0,
59
+ "optimizer_lr": 1e-05,
60
+ "optimizer_weight_decay": 0.0001,
61
+ "optimizer_lr_backbone": 1e-05
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:171863e6c2dc85af0751d5c3a75d19d2e177b66ddc2b39bee09d8dfe4532a2db
3
- size 1197789712
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:145f8bcf9e8e2ba95f132654d3940e9024907917119ecf73cceafd2b93fa7eab
3
+ size 206675136
train_config.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
  "dataset": {
3
- "repo_id": "aloha/push_T_top_to_bottom_mixture_5_100percent",
4
  "root": null,
5
  "episodes": null,
6
  "image_transforms": {
7
- "enable": true,
8
  "max_num_transforms": 3,
9
- "random_order": true,
10
  "tfs": {
11
  "brightness": {
12
  "weight": 1.0,
@@ -57,23 +57,39 @@
57
  1.5
58
  ]
59
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  }
61
  }
62
  },
63
  "revision": null,
64
  "use_imagenet_stats": true,
65
- "video_backend": "pyav"
 
66
  },
67
  "env": null,
68
  "policy": {
69
- "type": "smolvla",
70
  "n_obs_steps": 1,
71
- "normalization_mapping": {
72
- "VISUAL": "IDENTITY",
73
- "STATE": "MEAN_STD",
74
- "ACTION": "MEAN_STD"
75
- },
76
  "input_features": {
 
 
 
 
 
 
77
  "observation.images.top": {
78
  "type": "VISUAL",
79
  "shape": [
@@ -92,83 +108,64 @@
92
  }
93
  },
94
  "device": "cuda",
95
- "use_amp": true,
96
  "push_to_hub": true,
97
  "repo_id": "smolvla",
98
  "private": null,
99
  "tags": null,
100
  "license": null,
101
- "chunk_size": 50,
102
- "n_action_steps": 50,
103
- "max_state_dim": 32,
104
- "max_action_dim": 32,
105
- "resize_imgs_with_padding": [
106
- 512,
107
- 512
108
- ],
109
- "empty_cameras": 0,
110
- "adapt_to_pi_aloha": false,
111
- "use_delta_joint_actions_aloha": false,
112
- "tokenizer_max_length": 48,
113
- "num_steps": 10,
114
- "use_cache": true,
115
- "freeze_vision_encoder": true,
116
- "train_expert_only": true,
117
- "train_state_proj": true,
118
- "optimizer_lr": 0.0001,
119
- "optimizer_betas": [
120
- 0.9,
121
- 0.95
122
- ],
123
- "optimizer_eps": 1e-08,
124
- "optimizer_weight_decay": 1e-10,
125
- "optimizer_grad_clip_norm": 10,
126
- "scheduler_warmup_steps": 1000,
127
- "scheduler_decay_steps": 30000,
128
- "scheduler_decay_lr": 2.5e-06,
129
- "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
130
- "load_vlm_weights": false,
131
- "add_image_special_tokens": false,
132
- "attention_mode": "cross_attn",
133
- "prefix_length": -1,
134
- "pad_language_to": "longest",
135
- "num_expert_layers": -1,
136
- "num_vlm_layers": 16,
137
- "self_attn_every_n_layers": 2,
138
- "expert_width_multiplier": 0.75,
139
- "min_period": 0.004,
140
- "max_period": 4.0
141
  },
142
- "output_dir": "mixture_models/5augmented_smolvla/100percent/policy",
143
- "job_name": "smolvla",
144
  "resume": false,
145
  "seed": 100000,
146
  "num_workers": 4,
147
  "batch_size": 64,
148
- "steps": 30000,
149
  "eval_freq": 20000,
150
  "log_freq": 200,
151
  "save_checkpoint": true,
152
- "save_freq": 30000,
153
  "use_policy_training_preset": true,
154
  "optimizer": {
155
  "type": "adamw",
156
- "lr": 0.0001,
157
- "weight_decay": 1e-10,
158
- "grad_clip_norm": 10,
159
  "betas": [
160
  0.9,
161
- 0.95
162
  ],
163
  "eps": 1e-08
164
  },
165
- "scheduler": {
166
- "type": "cosine_decay_with_warmup",
167
- "num_warmup_steps": 1000,
168
- "num_decay_steps": 30000,
169
- "peak_lr": 0.0001,
170
- "decay_lr": 2.5e-06
171
- },
172
  "eval": {
173
  "n_episodes": 50,
174
  "batch_size": 50,
@@ -180,7 +177,9 @@
180
  "project": "lerobot",
181
  "entity": null,
182
  "notes": null,
183
- "run_id": "bqimq78k",
184
  "mode": null
185
- }
 
 
186
  }
 
1
  {
2
  "dataset": {
3
+ "repo_id": "aloha/simulation_push_T_bottom_right",
4
  "root": null,
5
  "episodes": null,
6
  "image_transforms": {
7
+ "enable": false,
8
  "max_num_transforms": 3,
9
+ "random_order": false,
10
  "tfs": {
11
  "brightness": {
12
  "weight": 1.0,
 
57
  1.5
58
  ]
59
  }
60
+ },
61
+ "affine": {
62
+ "weight": 1.0,
63
+ "type": "RandomAffine",
64
+ "kwargs": {
65
+ "degrees": [
66
+ -5.0,
67
+ 5.0
68
+ ],
69
+ "translate": [
70
+ 0.05,
71
+ 0.05
72
+ ]
73
+ }
74
  }
75
  }
76
  },
77
  "revision": null,
78
  "use_imagenet_stats": true,
79
+ "video_backend": "pyav",
80
+ "streaming": false
81
  },
82
  "env": null,
83
  "policy": {
84
+ "type": "act",
85
  "n_obs_steps": 1,
 
 
 
 
 
86
  "input_features": {
87
+ "observation.state": {
88
+ "type": "STATE",
89
+ "shape": [
90
+ 2
91
+ ]
92
+ },
93
  "observation.images.top": {
94
  "type": "VISUAL",
95
  "shape": [
 
108
  }
109
  },
110
  "device": "cuda",
111
+ "use_amp": false,
112
  "push_to_hub": true,
113
  "repo_id": "smolvla",
114
  "private": null,
115
  "tags": null,
116
  "license": null,
117
+ "pretrained_path": null,
118
+ "chunk_size": 100,
119
+ "n_action_steps": 100,
120
+ "normalization_mapping": {
121
+ "VISUAL": "MEAN_STD",
122
+ "STATE": "MEAN_STD",
123
+ "ACTION": "MEAN_STD"
124
+ },
125
+ "vision_backbone": "resnet18",
126
+ "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
127
+ "replace_final_stride_with_dilation": false,
128
+ "pre_norm": false,
129
+ "dim_model": 512,
130
+ "n_heads": 8,
131
+ "dim_feedforward": 3200,
132
+ "feedforward_activation": "relu",
133
+ "n_encoder_layers": 4,
134
+ "n_decoder_layers": 1,
135
+ "use_vae": true,
136
+ "latent_dim": 32,
137
+ "n_vae_encoder_layers": 4,
138
+ "temporal_ensemble_coeff": null,
139
+ "dropout": 0.1,
140
+ "kl_weight": 10.0,
141
+ "optimizer_lr": 1e-05,
142
+ "optimizer_weight_decay": 0.0001,
143
+ "optimizer_lr_backbone": 1e-05
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  },
145
+ "output_dir": "sim_policy/pushT_bottom_right_smolvla",
146
+ "job_name": "act",
147
  "resume": false,
148
  "seed": 100000,
149
  "num_workers": 4,
150
  "batch_size": 64,
151
+ "steps": 100000,
152
  "eval_freq": 20000,
153
  "log_freq": 200,
154
  "save_checkpoint": true,
155
+ "save_freq": 25000,
156
  "use_policy_training_preset": true,
157
  "optimizer": {
158
  "type": "adamw",
159
+ "lr": 1e-05,
160
+ "weight_decay": 0.0001,
161
+ "grad_clip_norm": 10.0,
162
  "betas": [
163
  0.9,
164
+ 0.999
165
  ],
166
  "eps": 1e-08
167
  },
168
+ "scheduler": null,
 
 
 
 
 
 
169
  "eval": {
170
  "n_episodes": 50,
171
  "batch_size": 50,
 
177
  "project": "lerobot",
178
  "entity": null,
179
  "notes": null,
180
+ "run_id": "l9y3vem8",
181
  "mode": null
182
+ },
183
+ "checkpoint_path": null,
184
+ "rename_map": {}
185
  }