Upload folder using huggingface_hub
Browse files- README.md +81 -0
- config.json +97 -0
- model.safetensors +3 -0
- modeling_lerobot_policy.py +72 -0
README.md
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
tags:
|
| 4 |
+
- lerobot
|
| 5 |
+
- robotics
|
| 6 |
+
- vision-language-model
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
# Infatoshi/smolvla
|
| 10 |
+
|
| 11 |
+
This repository contains a `smolvla_base` policy trained with the [`lerobot`](https://github.com/huggingface/lerobot) framework.
|
| 12 |
+
|
| 13 |
+
## Model Description
|
| 14 |
+
|
| 15 |
+
This model is a Vision-Language-Action (VLA) policy that can take visual observations, proprioceptive states, and a language instruction to predict robot actions.
|
| 16 |
+
|
| 17 |
+
- **Policy Type:** `smolvla`
|
| 18 |
+
- **Dataset:** `gribok201/smolvla_koch4`
|
| 19 |
+
- **VLM Backbone:** `HuggingFaceTB/SmolVLM2-500M-Video-Instruct`
|
| 20 |
+
- **Trained Steps:** `10000`
|
| 21 |
+
|
| 22 |
+
### I/O Schema
|
| 23 |
+
|
| 24 |
+
**Input Features:**
|
| 25 |
+
- `observation.image`: type `VISUAL`, shape `[3, 256, 256]`
|
| 26 |
+
- `observation.image2`: type `VISUAL`, shape `[3, 256, 256]`
|
| 27 |
+
- `observation.image3`: type `VISUAL`, shape `[3, 256, 256]`
|
| 28 |
+
- `observation.state`: type `STATE`, shape `[6]`
|
| 29 |
+
|
| 30 |
+
**Output Features:**
|
| 31 |
+
- `action`: type `ACTION`, shape `[6]`
|
| 32 |
+
|
| 33 |
+
**Image Preprocessing:**
|
| 34 |
+
Images are expected to be resized to `[512, 512]` before being passed to the model.
|
| 35 |
+
|
| 36 |
+
## How to Use
|
| 37 |
+
|
| 38 |
+
This model can be loaded using `transformers.AutoModel` with `trust_remote_code=True`.
|
| 39 |
+
**You MUST have `lerobot` installed in your environment for this to work.**
|
| 40 |
+
(`pip install lerobot`)
|
| 41 |
+
|
| 42 |
+
```python
|
| 43 |
+
from transformers import AutoModel
|
| 44 |
+
import torch
|
| 45 |
+
from PIL import Image
|
| 46 |
+
import torchvision.transforms as T
|
| 47 |
+
|
| 48 |
+
# Replace with your model's repo_id
|
| 49 |
+
repo_id = "Infatoshi/smolvla"
|
| 50 |
+
|
| 51 |
+
# Load the model - CRITICAL: trust_remote_code=True
|
| 52 |
+
# This executes the custom code in modeling_lerobot_policy.py
|
| 53 |
+
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
|
| 54 |
+
model.eval()
|
| 55 |
+
|
| 56 |
+
print("Model loaded successfully!")
|
| 57 |
+
|
| 58 |
+
# Example Inference:
|
| 59 |
+
# Create dummy inputs matching the model's expected schema.
|
| 60 |
+
resize_shape = tuple(model.config.resize_imgs_with_padding)
|
| 61 |
+
state_shape = tuple(model.config.input_features["observation.state"]["shape"])
|
| 62 |
+
|
| 63 |
+
# Dummy observations dictionary — NOTE: the camera keys below ("usb", "brio") must match the camera names used at training time; adjust them to your setup
|
| 64 |
+
dummy_observations = {
|
| 65 |
+
"state": torch.randn(1, *state_shape),
|
| 66 |
+
"images": {
|
| 67 |
+
"usb": torch.randn(1, 3, *resize_shape),
|
| 68 |
+
"brio": torch.randn(1, 3, *resize_shape),
|
| 69 |
+
}
|
| 70 |
+
}
|
| 71 |
+
dummy_language_instruction = "pick up the cube"
|
| 72 |
+
|
| 73 |
+
with torch.no_grad():
|
| 74 |
+
output = model(
|
| 75 |
+
observations=dummy_observations,
|
| 76 |
+
language_instruction=dummy_language_instruction
|
| 77 |
+
)
|
| 78 |
+
|
| 79 |
+
print("Inference output (predicted actions):", output)
|
| 80 |
+
print("Output shape:", output.shape)
|
| 81 |
+
```
|
config.json
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"LerobotSmolVLAWrappedModel"
|
| 4 |
+
],
|
| 5 |
+
"auto_map": {
|
| 6 |
+
"AutoModel": "modeling_lerobot_policy.LerobotSmolVLAWrappedModel"
|
| 7 |
+
},
|
| 8 |
+
"adapt_to_pi_aloha": false,
|
| 9 |
+
"add_image_special_tokens": false,
|
| 10 |
+
"attention_mode": "cross_attn",
|
| 11 |
+
"chunk_size": 50,
|
| 12 |
+
"device": "cuda",
|
| 13 |
+
"empty_cameras": 0,
|
| 14 |
+
"expert_width_multiplier": 0.75,
|
| 15 |
+
"freeze_vision_encoder": true,
|
| 16 |
+
"input_features": {
|
| 17 |
+
"observation.image": {
|
| 18 |
+
"shape": [
|
| 19 |
+
3,
|
| 20 |
+
256,
|
| 21 |
+
256
|
| 22 |
+
],
|
| 23 |
+
"type": "VISUAL"
|
| 24 |
+
},
|
| 25 |
+
"observation.image2": {
|
| 26 |
+
"shape": [
|
| 27 |
+
3,
|
| 28 |
+
256,
|
| 29 |
+
256
|
| 30 |
+
],
|
| 31 |
+
"type": "VISUAL"
|
| 32 |
+
},
|
| 33 |
+
"observation.image3": {
|
| 34 |
+
"shape": [
|
| 35 |
+
3,
|
| 36 |
+
256,
|
| 37 |
+
256
|
| 38 |
+
],
|
| 39 |
+
"type": "VISUAL"
|
| 40 |
+
},
|
| 41 |
+
"observation.state": {
|
| 42 |
+
"shape": [
|
| 43 |
+
6
|
| 44 |
+
],
|
| 45 |
+
"type": "STATE"
|
| 46 |
+
}
|
| 47 |
+
},
|
| 48 |
+
"load_vlm_weights": true,
|
| 49 |
+
"max_action_dim": 32,
|
| 50 |
+
"max_period": 4,
|
| 51 |
+
"max_state_dim": 32,
|
| 52 |
+
"min_period": 0.004,
|
| 53 |
+
"n_action_steps": 50,
|
| 54 |
+
"n_obs_steps": 1,
|
| 55 |
+
"normalization_mapping": {
|
| 56 |
+
"ACTION": "MEAN_STD",
|
| 57 |
+
"STATE": "MEAN_STD",
|
| 58 |
+
"VISUAL": "IDENTITY"
|
| 59 |
+
},
|
| 60 |
+
"num_expert_layers": 0,
|
| 61 |
+
"num_steps": 10,
|
| 62 |
+
"num_vlm_layers": 16,
|
| 63 |
+
"optimizer_betas": [
|
| 64 |
+
0.9,
|
| 65 |
+
0.95
|
| 66 |
+
],
|
| 67 |
+
"optimizer_eps": 1e-08,
|
| 68 |
+
"optimizer_grad_clip_norm": 10,
|
| 69 |
+
"optimizer_lr": 0.0001,
|
| 70 |
+
"optimizer_weight_decay": 1e-10,
|
| 71 |
+
"output_features": {
|
| 72 |
+
"action": {
|
| 73 |
+
"shape": [
|
| 74 |
+
6
|
| 75 |
+
],
|
| 76 |
+
"type": "ACTION"
|
| 77 |
+
}
|
| 78 |
+
},
|
| 79 |
+
"pad_language_to": "max_length",
|
| 80 |
+
"prefix_length": 0,
|
| 81 |
+
"resize_imgs_with_padding": [
|
| 82 |
+
512,
|
| 83 |
+
512
|
| 84 |
+
],
|
| 85 |
+
"scheduler_decay_lr": 2.5e-06,
|
| 86 |
+
"scheduler_decay_steps": 30000,
|
| 87 |
+
"scheduler_warmup_steps": 1000,
|
| 88 |
+
"self_attn_every_n_layers": 2,
|
| 89 |
+
"tokenizer_max_length": 48,
|
| 90 |
+
"train_expert_only": true,
|
| 91 |
+
"train_state_proj": true,
|
| 92 |
+
"type": "smolvla",
|
| 93 |
+
"use_amp": false,
|
| 94 |
+
"use_cache": true,
|
| 95 |
+
"use_delta_joint_actions_aloha": false,
|
| 96 |
+
"vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
|
| 97 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eeb41c1ae577c22207c9d16e3c0a303ee2dc00ad2b19ea987fdcd47db2d3283e
|
| 3 |
+
size 906713296
|
modeling_lerobot_policy.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from transformers import PreTrainedModel, PretrainedConfig
|
| 4 |
+
# This import assumes 'lerobot' is installed in the user's environment
|
| 5 |
+
from lerobot.smolvla_base import SmolVLABasePolicy
|
| 6 |
+
|
| 7 |
+
class LerobotSmolVLAConfig(PretrainedConfig):
    """HF-compatible configuration for the wrapped lerobot `smolvla` policy.

    Carries the policy hyperparameters, the I/O feature schema, and the VLM
    backbone name needed to rebuild the policy when the repository is loaded
    via ``AutoModel.from_pretrained(..., trust_remote_code=True)``.

    Any keyword argument (e.g. a value deserialized from ``config.json``)
    takes precedence over the baked-in defaults below; unknown keys are kept
    as plain attributes for forward compatibility.
    """

    model_type = "lerobot_smolvla"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Defaults mirroring the shipped config.json. Built per-instance so
        # the nested dicts/lists are never shared between config objects.
        defaults = {
            'adapt_to_pi_aloha': False,
            'add_image_special_tokens': False,
            'attention_mode': 'cross_attn',
            'chunk_size': 50,
            'device': 'cuda',
            'empty_cameras': 0,
            'expert_width_multiplier': 0.75,
            'freeze_vision_encoder': True,
            'input_features': {
                'observation.image': {'shape': [3, 256, 256], 'type': 'VISUAL'},
                'observation.image2': {'shape': [3, 256, 256], 'type': 'VISUAL'},
                'observation.image3': {'shape': [3, 256, 256], 'type': 'VISUAL'},
                'observation.state': {'shape': [6], 'type': 'STATE'},
            },
            'load_vlm_weights': True,
            'max_action_dim': 32,
            'max_period': 4,
            'max_state_dim': 32,
            'min_period': 0.004,
            'n_action_steps': 50,
            'n_obs_steps': 1,
            'normalization_mapping': {'ACTION': 'MEAN_STD', 'STATE': 'MEAN_STD', 'VISUAL': 'IDENTITY'},
            'num_expert_layers': 0,
            'num_steps': 10,
            'num_vlm_layers': 16,
            'optimizer_betas': [0.9, 0.95],
            # Fixed: these were the strings '1e-08' / '1e-10'; they are numeric
            # hyperparameters and must be floats to be usable by an optimizer.
            'optimizer_eps': 1e-08,
            'optimizer_grad_clip_norm': 10,
            'optimizer_lr': 0.0001,
            'optimizer_weight_decay': 1e-10,
            'output_features': {'action': {'shape': [6], 'type': 'ACTION'}},
            'pad_language_to': 'max_length',
            'prefix_length': 0,
            'resize_imgs_with_padding': [512, 512],
            'scheduler_decay_lr': 2.5e-06,
            'scheduler_decay_steps': 30000,
            'scheduler_warmup_steps': 1000,
            'self_attn_every_n_layers': 2,
            'tokenizer_max_length': 48,
            'train_expert_only': True,
            'train_state_proj': True,
            'type': 'smolvla',
            'use_amp': False,
            'use_cache': True,
            'use_delta_joint_actions_aloha': False,
            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Video-Instruct',
        }
        # Bug fix: the original unconditionally overwrote every attribute with
        # its hard-coded default AFTER super().__init__(**kwargs) had applied
        # the caller's kwargs, so any value loaded from config.json (or passed
        # explicitly) was silently discarded and the trailing
        # `if not hasattr(...)` loop became a no-op. Defaults now apply only
        # when the caller did not supply that key.
        for name, default in defaults.items():
            setattr(self, name, kwargs.get(name, default))
        # Keep any extra kwargs as attributes (forward compatibility with
        # fields added to config.json later).
        for k, v in kwargs.items():
            if not hasattr(self, k):
                setattr(self, k, v)
|
| 56 |
+
|
| 57 |
+
class LerobotSmolVLAWrappedModel(PreTrainedModel):
    """Thin ``PreTrainedModel`` shim around a lerobot SmolVLA policy.

    Exists solely so the policy can be instantiated through
    ``AutoModel.from_pretrained(..., trust_remote_code=True)``; all real
    work is delegated to ``self.smolvla_policy``.
    """

    config_class = LerobotSmolVLAConfig

    def __init__(self, config):
        super().__init__(config)
        # Every config field is forwarded verbatim as a keyword argument to
        # the underlying policy constructor.
        policy_kwargs = config.to_dict()
        self.smolvla_policy = SmolVLABasePolicy(**policy_kwargs)

    def forward(self, observations, actions=None, language_instruction=None, timestep=None):
        """Delegate a forward pass, unchanged, to the wrapped policy.

        The explicit keyword signature documents the expected call shape for
        users driving the model through ``AutoModel``.
        """
        return self.smolvla_policy(
            observations=observations,
            actions=actions,
            language_instruction=language_instruction,
            timestep=timestep,
        )
|