Upload folder using huggingface_hub
Browse files- README.md +81 -0
- config.json +97 -0
- model.safetensors +3 -0
- modeling_lerobot_policy.py +72 -0
README.md
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
tags:
|
| 4 |
+
- lerobot
|
| 5 |
+
- robotics
|
| 6 |
+
- vision-language-model
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
# Infatoshi/smolvla
|
| 10 |
+
|
| 11 |
+
This repository contains a `smolvla_base` policy trained with the [`lerobot`](https://github.com/huggingface/lerobot) framework.
|
| 12 |
+
|
| 13 |
+
## Model Description
|
| 14 |
+
|
| 15 |
+
This model is a Vision-Language-Action (VLA) policy that can take visual observations, proprioceptive states, and a language instruction to predict robot actions.
|
| 16 |
+
|
| 17 |
+
- **Policy Type:** `smolvla`
|
| 18 |
+
- **Dataset:** `gribok201/smolvla_koch4`
|
| 19 |
+
- **VLM Backbone:** `HuggingFaceTB/SmolVLM2-500M-Video-Instruct`
|
| 20 |
+
- **Trained Steps:** `10000`
|
| 21 |
+
|
| 22 |
+
### I/O Schema
|
| 23 |
+
|
| 24 |
+
**Input Features:**
|
| 25 |
+
- `observation.image`: type `VISUAL`, shape `[3, 256, 256]`
|
| 26 |
+
- `observation.image2`: type `VISUAL`, shape `[3, 256, 256]`
|
| 27 |
+
- `observation.image3`: type `VISUAL`, shape `[3, 256, 256]`
|
| 28 |
+
- `observation.state`: type `STATE`, shape `[6]`
|
| 29 |
+
|
| 30 |
+
**Output Features:**
|
| 31 |
+
- `action`: type `ACTION`, shape `[6]`
|
| 32 |
+
|
| 33 |
+
**Image Preprocessing:**
|
| 34 |
+
Images are expected to be resized to `[512, 512]` before being passed to the model.
|
| 35 |
+
|
| 36 |
+
## How to Use
|
| 37 |
+
|
| 38 |
+
This model can be loaded using `transformers.AutoModel` with `trust_remote_code=True`.
|
| 39 |
+
**You MUST have `lerobot` installed in your environment for this to work.**
|
| 40 |
+
(`pip install lerobot`)
|
| 41 |
+
|
| 42 |
+
```python
|
| 43 |
+
from transformers import AutoModel
|
| 44 |
+
import torch
|
| 45 |
+
from PIL import Image
|
| 46 |
+
import torchvision.transforms as T
|
| 47 |
+
|
| 48 |
+
# Replace with your model's repo_id
|
| 49 |
+
repo_id = "Infatoshi/smolvla"
|
| 50 |
+
|
| 51 |
+
# Load the model - CRITICAL: trust_remote_code=True
|
| 52 |
+
# This executes the custom code in modeling_lerobot_policy.py
|
| 53 |
+
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
|
| 54 |
+
model.eval()
|
| 55 |
+
|
| 56 |
+
print("Model loaded successfully!")
|
| 57 |
+
|
| 58 |
+
# Example Inference:
|
| 59 |
+
# Create dummy inputs matching the model's expected schema.
|
| 60 |
+
resize_shape = tuple(model.config.resize_imgs_with_padding)
|
| 61 |
+
state_shape = tuple(model.config.input_features["observation.state"]["shape"])
|
| 62 |
+
|
| 63 |
+
# Dummy observations dictionary — NOTE: the camera keys below ("usb", "brio") must match the camera names used at training time; adjust them to your setup
|
| 64 |
+
dummy_observations = {
|
| 65 |
+
"state": torch.randn(1, *state_shape),
|
| 66 |
+
"images": {
|
| 67 |
+
"usb": torch.randn(1, 3, *resize_shape),
|
| 68 |
+
"brio": torch.randn(1, 3, *resize_shape),
|
| 69 |
+
}
|
| 70 |
+
}
|
| 71 |
+
dummy_language_instruction = "pick up the cube"
|
| 72 |
+
|
| 73 |
+
with torch.no_grad():
|
| 74 |
+
output = model(
|
| 75 |
+
observations=dummy_observations,
|
| 76 |
+
language_instruction=dummy_language_instruction
|
| 77 |
+
)
|
| 78 |
+
|
| 79 |
+
print("Inference output (predicted actions):", output)
|
| 80 |
+
print("Output shape:", output.shape)
|
| 81 |
+
```
|
config.json
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"LerobotSmolVLAWrappedModel"
|
| 4 |
+
],
|
| 5 |
+
"auto_map": {
|
| 6 |
+
"AutoModel": "modeling_lerobot_policy.LerobotSmolVLAWrappedModel"
|
| 7 |
+
},
|
| 8 |
+
"adapt_to_pi_aloha": false,
|
| 9 |
+
"add_image_special_tokens": false,
|
| 10 |
+
"attention_mode": "cross_attn",
|
| 11 |
+
"chunk_size": 50,
|
| 12 |
+
"device": "cuda",
|
| 13 |
+
"empty_cameras": 0,
|
| 14 |
+
"expert_width_multiplier": 0.75,
|
| 15 |
+
"freeze_vision_encoder": true,
|
| 16 |
+
"input_features": {
|
| 17 |
+
"observation.image": {
|
| 18 |
+
"shape": [
|
| 19 |
+
3,
|
| 20 |
+
256,
|
| 21 |
+
256
|
| 22 |
+
],
|
| 23 |
+
"type": "VISUAL"
|
| 24 |
+
},
|
| 25 |
+
"observation.image2": {
|
| 26 |
+
"shape": [
|
| 27 |
+
3,
|
| 28 |
+
256,
|
| 29 |
+
256
|
| 30 |
+
],
|
| 31 |
+
"type": "VISUAL"
|
| 32 |
+
},
|
| 33 |
+
"observation.image3": {
|
| 34 |
+
"shape": [
|
| 35 |
+
3,
|
| 36 |
+
256,
|
| 37 |
+
256
|
| 38 |
+
],
|
| 39 |
+
"type": "VISUAL"
|
| 40 |
+
},
|
| 41 |
+
"observation.state": {
|
| 42 |
+
"shape": [
|
| 43 |
+
6
|
| 44 |
+
],
|
| 45 |
+
"type": "STATE"
|
| 46 |
+
}
|
| 47 |
+
},
|
| 48 |
+
"load_vlm_weights": true,
|
| 49 |
+
"max_action_dim": 32,
|
| 50 |
+
"max_period": 4,
|
| 51 |
+
"max_state_dim": 32,
|
| 52 |
+
"min_period": 0.004,
|
| 53 |
+
"n_action_steps": 50,
|
| 54 |
+
"n_obs_steps": 1,
|
| 55 |
+
"normalization_mapping": {
|
| 56 |
+
"ACTION": "MEAN_STD",
|
| 57 |
+
"STATE": "MEAN_STD",
|
| 58 |
+
"VISUAL": "IDENTITY"
|
| 59 |
+
},
|
| 60 |
+
"num_expert_layers": 0,
|
| 61 |
+
"num_steps": 10,
|
| 62 |
+
"num_vlm_layers": 16,
|
| 63 |
+
"optimizer_betas": [
|
| 64 |
+
0.9,
|
| 65 |
+
0.95
|
| 66 |
+
],
|
| 67 |
+
"optimizer_eps": 1e-08,
|
| 68 |
+
"optimizer_grad_clip_norm": 10,
|
| 69 |
+
"optimizer_lr": 0.0001,
|
| 70 |
+
"optimizer_weight_decay": 1e-10,
|
| 71 |
+
"output_features": {
|
| 72 |
+
"action": {
|
| 73 |
+
"shape": [
|
| 74 |
+
6
|
| 75 |
+
],
|
| 76 |
+
"type": "ACTION"
|
| 77 |
+
}
|
| 78 |
+
},
|
| 79 |
+
"pad_language_to": "max_length",
|
| 80 |
+
"prefix_length": 0,
|
| 81 |
+
"resize_imgs_with_padding": [
|
| 82 |
+
512,
|
| 83 |
+
512
|
| 84 |
+
],
|
| 85 |
+
"scheduler_decay_lr": 2.5e-06,
|
| 86 |
+
"scheduler_decay_steps": 30000,
|
| 87 |
+
"scheduler_warmup_steps": 1000,
|
| 88 |
+
"self_attn_every_n_layers": 2,
|
| 89 |
+
"tokenizer_max_length": 48,
|
| 90 |
+
"train_expert_only": true,
|
| 91 |
+
"train_state_proj": true,
|
| 92 |
+
"type": "smolvla",
|
| 93 |
+
"use_amp": false,
|
| 94 |
+
"use_cache": true,
|
| 95 |
+
"use_delta_joint_actions_aloha": false,
|
| 96 |
+
"vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
|
| 97 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eeb41c1ae577c22207c9d16e3c0a303ee2dc00ad2b19ea987fdcd47db2d3283e
|
| 3 |
+
size 906713296
|
modeling_lerobot_policy.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from transformers import PreTrainedModel, PretrainedConfig
|
| 4 |
+
# This import assumes 'lerobot' is installed in the user's environment
|
| 5 |
+
from lerobot.smolvla_base import SmolVLABasePolicy
|
| 6 |
+
|
| 7 |
+
class LerobotSmolVLAConfig(PretrainedConfig):
    """HF-compatible configuration for the wrapped lerobot `smolvla` policy.

    Carries the policy hyperparameters, the I/O feature schema, and the VLM
    backbone name needed to rebuild the policy when the repository is loaded
    via ``AutoModel.from_pretrained(..., trust_remote_code=True)``.

    Any keyword argument (e.g. a value deserialized from ``config.json``)
    takes precedence over the baked-in defaults below; unknown keys are kept
    as plain attributes for forward compatibility.
    """

    model_type = "lerobot_smolvla"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Defaults mirroring the shipped config.json. Built per-instance so
        # the nested dicts/lists are never shared between config objects.
        defaults = {
            'adapt_to_pi_aloha': False,
            'add_image_special_tokens': False,
            'attention_mode': 'cross_attn',
            'chunk_size': 50,
            'device': 'cuda',
            'empty_cameras': 0,
            'expert_width_multiplier': 0.75,
            'freeze_vision_encoder': True,
            'input_features': {
                'observation.image': {'shape': [3, 256, 256], 'type': 'VISUAL'},
                'observation.image2': {'shape': [3, 256, 256], 'type': 'VISUAL'},
                'observation.image3': {'shape': [3, 256, 256], 'type': 'VISUAL'},
                'observation.state': {'shape': [6], 'type': 'STATE'},
            },
            'load_vlm_weights': True,
            'max_action_dim': 32,
            'max_period': 4,
            'max_state_dim': 32,
            'min_period': 0.004,
            'n_action_steps': 50,
            'n_obs_steps': 1,
            'normalization_mapping': {'ACTION': 'MEAN_STD', 'STATE': 'MEAN_STD', 'VISUAL': 'IDENTITY'},
            'num_expert_layers': 0,
            'num_steps': 10,
            'num_vlm_layers': 16,
            'optimizer_betas': [0.9, 0.95],
            # Fixed: these were the strings '1e-08' / '1e-10'; they are numeric
            # hyperparameters and must be floats to be usable by an optimizer.
            'optimizer_eps': 1e-08,
            'optimizer_grad_clip_norm': 10,
            'optimizer_lr': 0.0001,
            'optimizer_weight_decay': 1e-10,
            'output_features': {'action': {'shape': [6], 'type': 'ACTION'}},
            'pad_language_to': 'max_length',
            'prefix_length': 0,
            'resize_imgs_with_padding': [512, 512],
            'scheduler_decay_lr': 2.5e-06,
            'scheduler_decay_steps': 30000,
            'scheduler_warmup_steps': 1000,
            'self_attn_every_n_layers': 2,
            'tokenizer_max_length': 48,
            'train_expert_only': True,
            'train_state_proj': True,
            'type': 'smolvla',
            'use_amp': False,
            'use_cache': True,
            'use_delta_joint_actions_aloha': False,
            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Video-Instruct',
        }
        # Bug fix: the original unconditionally overwrote every attribute with
        # its hard-coded default AFTER super().__init__(**kwargs) had applied
        # the caller's kwargs, so any value loaded from config.json (or passed
        # explicitly) was silently discarded and the trailing
        # `if not hasattr(...)` loop became a no-op. Defaults now apply only
        # when the caller did not supply that key.
        for name, default in defaults.items():
            setattr(self, name, kwargs.get(name, default))
        # Keep any extra kwargs as attributes (forward compatibility with
        # fields added to config.json later).
        for k, v in kwargs.items():
            if not hasattr(self, k):
                setattr(self, k, v)
|
| 56 |
+
|
| 57 |
+
class LerobotSmolVLAWrappedModel(PreTrainedModel):
    """Thin ``PreTrainedModel`` shim around a lerobot SmolVLA policy.

    Exists solely so the policy can be instantiated through
    ``AutoModel.from_pretrained(..., trust_remote_code=True)``; all real
    work is delegated to ``self.smolvla_policy``.
    """

    config_class = LerobotSmolVLAConfig

    def __init__(self, config):
        super().__init__(config)
        # Every config field is forwarded verbatim as a keyword argument to
        # the underlying policy constructor.
        policy_kwargs = config.to_dict()
        self.smolvla_policy = SmolVLABasePolicy(**policy_kwargs)

    def forward(self, observations, actions=None, language_instruction=None, timestep=None):
        """Delegate a forward pass, unchanged, to the wrapped policy.

        The explicit keyword signature documents the expected call shape for
        users driving the model through ``AutoModel``.
        """
        return self.smolvla_policy(
            observations=observations,
            actions=actions,
            language_instruction=language_instruction,
            timestep=timestep,
        )
|