Robotics · LeRobot · Safetensors · smolvla

Commit 15d7600 (verified) · committed by kobikelemen · 1 parent: 54ae9fc

Upload policy weights, train config and readme

Files changed (4):
  1. README.md +63 -0
  2. config.json +12 -36
  3. model.safetensors +2 -2
  4. train_config.json +16 -40
README.md ADDED
@@ -0,0 +1,63 @@
---
base_model: lerobot/smolvla_base
datasets: kobikelemen/towel_fold_trimmed
library_name: lerobot
license: apache-2.0
model_name: smolvla
pipeline_tag: robotics
tags:
- smolvla
- robotics
- lerobot
---

# Model Card for smolvla

<!-- Provide a quick summary of what the model is/does. -->

[SmolVLA](https://huggingface.co/papers/2506.01844) is a compact, efficient vision-language-action model that achieves competitive performance at reduced computational costs and can be deployed on consumer-grade hardware.

This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
See the full documentation at [LeRobot Docs](https://huggingface.co/docs/lerobot/index).

---

## How to Get Started with the Model

For a complete walkthrough, see the [training guide](https://huggingface.co/docs/lerobot/il_robots#train-a-policy).
Below is the short version of how to train and run inference/eval:

### Train from scratch

```bash
lerobot-train \
  --dataset.repo_id=${HF_USER}/<dataset> \
  --policy.type=act \
  --output_dir=outputs/train/<desired_policy_repo_id> \
  --job_name=lerobot_training \
  --policy.device=cuda \
  --policy.repo_id=${HF_USER}/<desired_policy_repo_id> \
  --wandb.enable=true
```

_Writes checkpoints to `outputs/train/<desired_policy_repo_id>/checkpoints/`._

### Evaluate the policy/run inference

```bash
lerobot-record \
  --robot.type=so100_follower \
  --dataset.repo_id=<hf_user>/eval_<dataset> \
  --policy.path=<hf_user>/<desired_policy_repo_id> \
  --episodes=10
```

Prefix the dataset repo with **eval\_** and point `--policy.path` to a local or Hub checkpoint.

---

## Model Details

- **License:** apache-2.0
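As a complement to the CLI instructions in the card above, the uploaded checkpoint can also be loaded directly in Python. The snippet below is a minimal sketch, not the official LeRobot recipe: it assumes a recent lerobot release where `SmolVLAPolicy` is importable from `lerobot.policies.smolvla.modeling_smolvla` and exposes `from_pretrained`/`select_action`; the repo id, task string, and dummy observation shapes (a 6-dim state and two 3x720x1280 cameras, per `config.json` in this commit) are placeholders to replace for a real setup.

```python
# Illustrative inference sketch, not the official LeRobot recipe.
# Assumes a recent lerobot release; adjust the import path to your installed version.
import torch
from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy

# Placeholder repo id, as in the CLI examples above.
policy = SmolVLAPolicy.from_pretrained("<hf_user>/<desired_policy_repo_id>")
policy.eval()

# Dummy batch matching the features declared in config.json:
# a 6-dim state, two 3x720x1280 cameras, and a language instruction.
batch = {
    "observation.state": torch.zeros(1, 6),
    "observation.images.overhead": torch.zeros(1, 3, 720, 1280),
    "observation.images.wrist": torch.zeros(1, 3, 720, 1280),
    "task": ["fold the towel"],  # placeholder instruction
}
with torch.no_grad():
    action = policy.select_action(batch)  # one action step from the predicted chunk
print(action.shape)
```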
config.json CHANGED
@@ -8,44 +8,20 @@
         6
       ]
     },
-    "observation.images.camera1": {
-      "type": "VISUAL",
-      "shape": [
-        3,
-        256,
-        256
-      ]
-    },
-    "observation.images.camera2": {
-      "type": "VISUAL",
-      "shape": [
-        3,
-        256,
-        256
-      ]
-    },
-    "observation.images.camera3": {
-      "type": "VISUAL",
-      "shape": [
-        3,
-        256,
-        256
-      ]
-    },
     "observation.images.overhead": {
       "type": "VISUAL",
       "shape": [
         3,
-        256,
-        256
+        720,
+        1280
       ]
     },
     "observation.images.wrist": {
       "type": "VISUAL",
       "shape": [
         3,
-        256,
-        256
+        720,
+        1280
       ]
     }
   },
@@ -64,9 +40,9 @@
   "private": null,
   "tags": null,
   "license": null,
-  "pretrained_path": "lerobot/smolvla_base",
-  "chunk_size": 50,
-  "n_action_steps": 50,
+  "pretrained_path": null,
+  "chunk_size": 20,
+  "n_action_steps": 20,
   "normalization_mapping": {
     "VISUAL": "IDENTITY",
     "STATE": "MEAN_STD",
@@ -94,17 +70,17 @@
   ],
   "optimizer_eps": 1e-08,
   "optimizer_weight_decay": 1e-10,
-  "optimizer_grad_clip_norm": 10.0,
+  "optimizer_grad_clip_norm": 10,
   "scheduler_warmup_steps": 1000,
   "scheduler_decay_steps": 30000,
   "scheduler_decay_lr": 2.5e-06,
   "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
-  "load_vlm_weights": true,
+  "load_vlm_weights": false,
   "add_image_special_tokens": false,
   "attention_mode": "cross_attn",
-  "prefix_length": 0,
-  "pad_language_to": "max_length",
-  "num_expert_layers": 0,
+  "prefix_length": -1,
+  "pad_language_to": "longest",
+  "num_expert_layers": -1,
   "num_vlm_layers": 16,
   "self_attn_every_n_layers": 2,
   "expert_width_multiplier": 0.75,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f2f468e47143294e1793198f74fc41648f5a8196e7da5f0a948c6dd9af4fb89c
-size 906712520
+oid sha256:bad94f2fccee6bb0032045222af18fa89ba7ad2a27fc1f9efafbf4823992708c
+size 1197789224
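The updated LFS pointer shows the checkpoint growing from about 907 MB to roughly 1.2 GB. To sanity-check a downloaded copy locally, here is a small hedged sketch using the `safetensors` library; the local file path is an assumption.

```python
# Sketch: inspect the downloaded checkpoint with the safetensors library.
from safetensors import safe_open

path = "model.safetensors"  # assumed local path to the downloaded file
with safe_open(path, framework="pt") as f:
    keys = list(f.keys())
    n_params = sum(f.get_tensor(k).numel() for k in keys)
print(f"{len(keys)} tensors, {n_params:,} parameters")
```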
train_config.json CHANGED
@@ -76,7 +76,7 @@
   },
   "revision": null,
   "use_imagenet_stats": true,
-  "video_backend": "pyav",
+  "video_backend": "torchcodec",
   "streaming": false
 },
 "env": null,
@@ -90,44 +90,20 @@
         6
       ]
     },
-    "observation.images.camera1": {
-      "type": "VISUAL",
-      "shape": [
-        3,
-        256,
-        256
-      ]
-    },
-    "observation.images.camera2": {
-      "type": "VISUAL",
-      "shape": [
-        3,
-        256,
-        256
-      ]
-    },
-    "observation.images.camera3": {
-      "type": "VISUAL",
-      "shape": [
-        3,
-        256,
-        256
-      ]
-    },
     "observation.images.overhead": {
       "type": "VISUAL",
       "shape": [
         3,
-        256,
-        256
+        720,
+        1280
      ]
     },
     "observation.images.wrist": {
       "type": "VISUAL",
       "shape": [
         3,
-        256,
-        256
+        720,
+        1280
       ]
     }
   },
@@ -146,9 +122,9 @@
   "private": null,
   "tags": null,
   "license": null,
-  "pretrained_path": "lerobot/smolvla_base",
-  "chunk_size": 50,
-  "n_action_steps": 50,
+  "pretrained_path": null,
+  "chunk_size": 20,
+  "n_action_steps": 20,
   "normalization_mapping": {
     "VISUAL": "IDENTITY",
     "STATE": "MEAN_STD",
@@ -176,24 +152,24 @@
   ],
   "optimizer_eps": 1e-08,
   "optimizer_weight_decay": 1e-10,
-  "optimizer_grad_clip_norm": 10.0,
+  "optimizer_grad_clip_norm": 10,
   "scheduler_warmup_steps": 1000,
   "scheduler_decay_steps": 30000,
   "scheduler_decay_lr": 2.5e-06,
   "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
-  "load_vlm_weights": true,
+  "load_vlm_weights": false,
   "add_image_special_tokens": false,
   "attention_mode": "cross_attn",
-  "prefix_length": 0,
-  "pad_language_to": "max_length",
-  "num_expert_layers": 0,
+  "prefix_length": -1,
+  "pad_language_to": "longest",
+  "num_expert_layers": -1,
   "num_vlm_layers": 16,
   "self_attn_every_n_layers": 2,
   "expert_width_multiplier": 0.75,
   "min_period": 0.004,
   "max_period": 4.0
 },
-"output_dir": "outputs/train/2025-10-20/05-54-55_smolvla",
+"output_dir": "outputs/train/2025-10-21/05-34-13_smolvla",
 "job_name": "smolvla",
 "resume": false,
 "seed": 1000,
@@ -209,7 +185,7 @@
   "type": "adamw",
   "lr": 0.0001,
   "weight_decay": 1e-10,
-  "grad_clip_norm": 10.0,
+  "grad_clip_norm": 10,
   "betas": [
     0.9,
     0.95
@@ -234,7 +210,7 @@
   "project": "lerobot",
   "entity": null,
   "notes": null,
-  "run_id": "ji942rc2",
+  "run_id": "m4h1qq5w",
   "mode": null
 }
 }
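To confirm which training settings this commit changed (video backend, action-chunk length, VLM weight loading), the updated `train_config.json` can be fetched from the Hub and inspected. A minimal sketch, assuming the placeholder repo id is filled in and that the file keeps the `dataset`/`policy` nesting visible in the diff above:

```python
# Sketch: download and inspect the updated train_config.json.
# The repo id is a placeholder; hf_hub_download is from huggingface_hub.
import json
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="<hf_user>/<desired_policy_repo_id>",
    filename="train_config.json",
)
with open(path) as f:
    cfg = json.load(f)

print("video_backend:", cfg["dataset"]["video_backend"])       # "torchcodec" after this commit
print("chunk_size:", cfg["policy"]["chunk_size"])               # 20
print("n_action_steps:", cfg["policy"]["n_action_steps"])       # 20
print("load_vlm_weights:", cfg["policy"]["load_vlm_weights"])   # False
```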