Upload policy weights, train config and readme

Browse files

Files changed (4) hide show

README.md +7 -6
config.json +42 -23
model.safetensors +2 -2
train_config.json +57 -32

README.md CHANGED Viewed

@@ -1,21 +1,22 @@
 ---
 datasets: matanxp/record-complex-25
 library_name: lerobot
 license: apache-2.0
-model_name: act
 pipeline_tag: robotics
 tags:
 - lerobot
-- act
 - robotics
 ---
-# Model Card for act
 <!-- Provide a quick summary of what the model is/does. -->
-[Action Chunking with Transformers (ACT)](https://huggingface.co/papers/2304.13705) is an imitation-learning method that predicts short action chunks instead of single steps. It learns from teleoperated data and often achieves high success rates.
 This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
@@ -31,7 +32,7 @@ Below is the short version on how to train and run inference/eval:
 ### Train from scratch
 ```bash
-python -m lerobot.scripts.train \
   --dataset.repo_id=${HF_USER}/<dataset> \
   --policy.type=act \
   --output_dir=outputs/train/<desired_policy_repo_id> \
@@ -46,7 +47,7 @@ _Writes checkpoints to `outputs/train/<desired_policy_repo_id>/checkpoints/`._
 ### Evaluate the policy/run inference
 ```bash
-python -m lerobot.record \
   --robot.type=so100_follower \
   --dataset.repo_id=<hf_user>/eval_<dataset> \
   --policy.path=<hf_user>/<desired_policy_repo_id> \

 ---
+base_model: lerobot/smolvla_base
 datasets: matanxp/record-complex-25
 library_name: lerobot
 license: apache-2.0
+model_name: smolvla
 pipeline_tag: robotics
 tags:
 - lerobot
+- smolvla
 - robotics
 ---
+# Model Card for smolvla
 <!-- Provide a quick summary of what the model is/does. -->
+[SmolVLA](https://huggingface.co/papers/2506.01844) is a compact, efficient vision-language-action model that achieves competitive performance at reduced computational costs and can be deployed on consumer-grade hardware.
 This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
 ### Train from scratch
 ```bash
+lerobot-train \
   --dataset.repo_id=${HF_USER}/<dataset> \
   --policy.type=smolvla \
   --output_dir=outputs/train/<desired_policy_repo_id> \
 ### Evaluate the policy/run inference
 ```bash
+lerobot-record \
   --robot.type=so100_follower \
   --dataset.repo_id=<hf_user>/eval_<dataset> \
   --policy.path=<hf_user>/<desired_policy_repo_id> \

config.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
-    "type": "act",
     "n_obs_steps": 1,
     "normalization_mapping": {
-        "VISUAL": "MEAN_STD",
         "STATE": "MEAN_STD",
         "ACTION": "MEAN_STD"
     },
@@ -37,25 +37,44 @@
     "private": null,
     "tags": null,
     "license": null,
-    "chunk_size": 100,
-    "n_action_steps": 100,
-    "vision_backbone": "resnet18",
-    "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
-    "replace_final_stride_with_dilation": false,
-    "pre_norm": false,
-    "dim_model": 512,
-    "n_heads": 8,
-    "dim_feedforward": 3200,
-    "feedforward_activation": "relu",
-    "n_encoder_layers": 4,
-    "n_decoder_layers": 1,
-    "use_vae": true,
-    "latent_dim": 32,
-    "n_vae_encoder_layers": 4,
-    "temporal_ensemble_coeff": null,
-    "dropout": 0.1,
-    "kl_weight": 10.0,
-    "optimizer_lr": 1e-05,
-    "optimizer_weight_decay": 0.0001,
-    "optimizer_lr_backbone": 1e-05
 }

 {
+    "type": "smolvla",
     "n_obs_steps": 1,
     "normalization_mapping": {
+        "VISUAL": "IDENTITY",
         "STATE": "MEAN_STD",
         "ACTION": "MEAN_STD"
     },
     "private": null,
     "tags": null,
     "license": null,
+    "chunk_size": 50,
+    "n_action_steps": 50,
+    "max_state_dim": 32,
+    "max_action_dim": 32,
+    "resize_imgs_with_padding": [
+        512,
+        512
+    ],
+    "empty_cameras": 0,
+    "adapt_to_pi_aloha": false,
+    "use_delta_joint_actions_aloha": false,
+    "tokenizer_max_length": 48,
+    "num_steps": 10,
+    "use_cache": true,
+    "freeze_vision_encoder": true,
+    "train_expert_only": true,
+    "train_state_proj": true,
+    "optimizer_lr": 0.0001,
+    "optimizer_betas": [
+        0.9,
+        0.95
+    ],
+    "optimizer_eps": 1e-08,
+    "optimizer_weight_decay": 1e-10,
+    "optimizer_grad_clip_norm": 10.0,
+    "scheduler_warmup_steps": 1000,
+    "scheduler_decay_steps": 30000,
+    "scheduler_decay_lr": 2.5e-06,
+    "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
+    "load_vlm_weights": true,
+    "add_image_special_tokens": false,
+    "attention_mode": "cross_attn",
+    "prefix_length": 0,
+    "pad_language_to": "max_length",
+    "num_expert_layers": 0,
+    "num_vlm_layers": 16,
+    "self_attn_every_n_layers": 2,
+    "expert_width_multiplier": 0.75,
+    "min_period": 0.004,
+    "max_period": 4.0
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8318c083f29ffa4a0cd652762ea92fa710f5f309189bfeaded9f9b39bdd0bd0a
-size 206700792

 version https://git-lfs.github.com/spec/v1
+oid sha256:e5a9046fc1758ef60be392e471c1aade1e729dda713d94bca1d993f58859c264
+size 906713296

train_config.json CHANGED Viewed

@@ -66,10 +66,10 @@
     },
     "env": null,
     "policy": {
-        "type": "act",
         "n_obs_steps": 1,
         "normalization_mapping": {
-            "VISUAL": "MEAN_STD",
             "STATE": "MEAN_STD",
             "ACTION": "MEAN_STD"
         },
@@ -104,35 +104,54 @@
         "private": null,
         "tags": null,
         "license": null,
-        "chunk_size": 100,
-        "n_action_steps": 100,
-        "vision_backbone": "resnet18",
-        "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
-        "replace_final_stride_with_dilation": false,
-        "pre_norm": false,
-        "dim_model": 512,
-        "n_heads": 8,
-        "dim_feedforward": 3200,
-        "feedforward_activation": "relu",
-        "n_encoder_layers": 4,
-        "n_decoder_layers": 1,
-        "use_vae": true,
-        "latent_dim": 32,
-        "n_vae_encoder_layers": 4,
-        "temporal_ensemble_coeff": null,
-        "dropout": 0.1,
-        "kl_weight": 10.0,
-        "optimizer_lr": 1e-05,
-        "optimizer_weight_decay": 0.0001,
-        "optimizer_lr_backbone": 1e-05
     },
-    "output_dir": "outputs/train/act_record-complex-25",
-    "job_name": "act_record-complex-25",
     "resume": false,
     "seed": 1000,
     "num_workers": 4,
-    "batch_size": 8,
-    "steps": 100000,
     "eval_freq": 20000,
     "log_freq": 200,
     "save_checkpoint": true,
@@ -140,16 +159,22 @@
     "use_policy_training_preset": true,
     "optimizer": {
         "type": "adamw",
-        "lr": 1e-05,
-        "weight_decay": 0.0001,
         "grad_clip_norm": 10.0,
         "betas": [
             0.9,
-            0.999
         ],
         "eps": 1e-08
     },
-    "scheduler": null,
     "eval": {
         "n_episodes": 50,
         "batch_size": 50,
@@ -161,7 +186,7 @@
         "project": "lerobot",
         "entity": null,
         "notes": null,
-        "run_id": "k779etf4",
         "mode": null
     }
 }

     },
     "env": null,
     "policy": {
+        "type": "smolvla",
         "n_obs_steps": 1,
         "normalization_mapping": {
+            "VISUAL": "IDENTITY",
             "STATE": "MEAN_STD",
             "ACTION": "MEAN_STD"
         },
         "private": null,
         "tags": null,
         "license": null,
+        "chunk_size": 50,
+        "n_action_steps": 50,
+        "max_state_dim": 32,
+        "max_action_dim": 32,
+        "resize_imgs_with_padding": [
+            512,
+            512
+        ],
+        "empty_cameras": 0,
+        "adapt_to_pi_aloha": false,
+        "use_delta_joint_actions_aloha": false,
+        "tokenizer_max_length": 48,
+        "num_steps": 10,
+        "use_cache": true,
+        "freeze_vision_encoder": true,
+        "train_expert_only": true,
+        "train_state_proj": true,
+        "optimizer_lr": 0.0001,
+        "optimizer_betas": [
+            0.9,
+            0.95
+        ],
+        "optimizer_eps": 1e-08,
+        "optimizer_weight_decay": 1e-10,
+        "optimizer_grad_clip_norm": 10.0,
+        "scheduler_warmup_steps": 1000,
+        "scheduler_decay_steps": 30000,
+        "scheduler_decay_lr": 2.5e-06,
+        "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
+        "load_vlm_weights": true,
+        "add_image_special_tokens": false,
+        "attention_mode": "cross_attn",
+        "prefix_length": 0,
+        "pad_language_to": "max_length",
+        "num_expert_layers": 0,
+        "num_vlm_layers": 16,
+        "self_attn_every_n_layers": 2,
+        "expert_width_multiplier": 0.75,
+        "min_period": 0.004,
+        "max_period": 4.0
     },
+    "output_dir": "outputs/train/smolvla_record-complex-25",
+    "job_name": "smolvla_record-complex-25",
     "resume": false,
     "seed": 1000,
     "num_workers": 4,
+    "batch_size": 32,
+    "steps": 20000,
     "eval_freq": 20000,
     "log_freq": 200,
     "save_checkpoint": true,
     "use_policy_training_preset": true,
     "optimizer": {
         "type": "adamw",
+        "lr": 0.0001,
+        "weight_decay": 1e-10,
         "grad_clip_norm": 10.0,
         "betas": [
             0.9,
+            0.95
         ],
         "eps": 1e-08
     },
+    "scheduler": {
+        "type": "cosine_decay_with_warmup",
+        "num_warmup_steps": 1000,
+        "num_decay_steps": 30000,
+        "peak_lr": 0.0001,
+        "decay_lr": 2.5e-06
+    },
     "eval": {
         "n_episodes": 50,
         "batch_size": 50,
         "project": "lerobot",
         "entity": null,
         "notes": null,
+        "run_id": "u4osiv3x",
         "mode": null
     }
 }