gribok201 committed
Commit 2f14779 · verified · 1 Parent(s): 303e610

Upload folder using huggingface_hub

Files changed (4)
  1. README.md +81 -0
  2. config.json +97 -0
  3. model.safetensors +3 -0
  4. modeling_lerobot_policy.py +72 -0
README.md ADDED
@@ -0,0 +1,81 @@
+ ---
+ license: apache-2.0
+ tags:
+ - lerobot
+ - robotics
+ - vision-language-model
+ ---
+
+ # Infatoshi/smolvla
+
+ This repository contains a `smolvla_base` policy trained with the [`lerobot`](https://github.com/huggingface/lerobot) framework.
+
+ ## Model Description
+
+ This model is a Vision-Language-Action (VLA) policy: it takes visual observations, a proprioceptive state, and a language instruction, and predicts robot actions.
+
+ - **Policy Type:** `smolvla`
+ - **Dataset:** `gribok201/smolvla_koch4`
+ - **VLM Backbone:** `HuggingFaceTB/SmolVLM2-500M-Video-Instruct`
+ - **Trained Steps:** `10000`
+
+ ### I/O Schema
+
+ **Input Features:**
+ - `observation.image`: type `VISUAL`, shape `[3, 256, 256]`
+ - `observation.image2`: type `VISUAL`, shape `[3, 256, 256]`
+ - `observation.image3`: type `VISUAL`, shape `[3, 256, 256]`
+ - `observation.state`: type `STATE`, shape `[6]`
+
+ **Output Features:**
+ - `action`: type `ACTION`, shape `[6]`
+
+ **Image Preprocessing:**
+ Images are resized, with padding, to `[512, 512]` before being passed to the model (see `resize_imgs_with_padding` in the config and the sketch below).
+
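+ For reference, here is a minimal sketch of resize-with-padding (letterboxing) to `[512, 512]`, assuming the image is scaled to fit and then zero-padded to a square; the exact padding scheme `lerobot` applies may differ:
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def resize_with_padding(img: torch.Tensor, size: int = 512) -> torch.Tensor:
+     """Scale a [C, H, W] image so its long side equals `size`, then zero-pad to a square."""
+     _, h, w = img.shape
+     scale = size / max(h, w)
+     new_h, new_w = round(h * scale), round(w * scale)
+     img = F.interpolate(img.unsqueeze(0), size=(new_h, new_w),
+                         mode="bilinear", align_corners=False)
+     # F.pad's order is (left, right, top, bottom) for the last two dims.
+     return F.pad(img, (0, size - new_w, 0, size - new_h)).squeeze(0)
+ ```
+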
+ ## How to Use
+
+ This model can be loaded with `transformers.AutoModel` and `trust_remote_code=True`.
+ **You MUST have `lerobot` installed in your environment** (`pip install lerobot`),
+ since the custom modeling code imports its policy class from that package.
+
+ ```python
+ import torch
+ from transformers import AutoModel
+
+ # Replace with your model's repo_id
+ repo_id = "Infatoshi/smolvla"
+
+ # Load the model. CRITICAL: trust_remote_code=True executes the custom
+ # code in modeling_lerobot_policy.py from this repository.
+ model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
+ model.eval()
+
+ print("Model loaded successfully!")
+
+ # Example inference: create dummy inputs matching the model's I/O schema.
+ resize_shape = tuple(model.config.resize_imgs_with_padding)  # (512, 512)
+ state_shape = tuple(model.config.input_features["observation.state"]["shape"])  # (6,)
+
+ # The config declares three VISUAL inputs (observation.image, .image2, .image3),
+ # so supply one tensor per camera. The keys below ("usb", "brio", "top") are
+ # example camera names; use the names your policy was trained with.
+ dummy_observations = {
+     "state": torch.randn(1, *state_shape),
+     "images": {
+         "usb": torch.randn(1, 3, *resize_shape),
+         "brio": torch.randn(1, 3, *resize_shape),
+         "top": torch.randn(1, 3, *resize_shape),
+     },
+ }
+ dummy_language_instruction = "pick up the cube"
+
+ with torch.no_grad():
+     output = model(
+         observations=dummy_observations,
+         language_instruction=dummy_language_instruction,
+     )
+
+ print("Inference output (predicted actions):", output)
+ print("Output shape:", output.shape)  # assumes the policy returns a tensor
+ ```
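+
+ The config's `chunk_size: 50` and `n_action_steps: 50` mean one forward pass yields a chunk of 50 future actions over the 6-dim action space, all of which are executed before the policy is queried again. A hedged sketch of consuming that chunk in a control loop, assuming the output is a `[batch, chunk_size, action_dim]` tensor and a hypothetical `robot.send_action` interface (verify both against your `lerobot` version):
+
+ ```python
+ # Assumed output layout: [batch, chunk_size, action_dim] = [1, 50, 6].
+ actions = output
+ for t in range(model.config.n_action_steps):
+     a_t = actions[0, t]        # one 6-dim action for timestep t
+     # robot.send_action(a_t)   # hypothetical robot interface
+ ```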
config.json ADDED
@@ -0,0 +1,97 @@
+ {
+   "architectures": [
+     "LerobotSmolVLAWrappedModel"
+   ],
+   "auto_map": {
+     "AutoModel": "modeling_lerobot_policy.LerobotSmolVLAWrappedModel"
+   },
+   "adapt_to_pi_aloha": false,
+   "add_image_special_tokens": false,
+   "attention_mode": "cross_attn",
+   "chunk_size": 50,
+   "device": "cuda",
+   "empty_cameras": 0,
+   "expert_width_multiplier": 0.75,
+   "freeze_vision_encoder": true,
+   "input_features": {
+     "observation.image": {
+       "shape": [
+         3,
+         256,
+         256
+       ],
+       "type": "VISUAL"
+     },
+     "observation.image2": {
+       "shape": [
+         3,
+         256,
+         256
+       ],
+       "type": "VISUAL"
+     },
+     "observation.image3": {
+       "shape": [
+         3,
+         256,
+         256
+       ],
+       "type": "VISUAL"
+     },
+     "observation.state": {
+       "shape": [
+         6
+       ],
+       "type": "STATE"
+     }
+   },
+   "load_vlm_weights": true,
+   "max_action_dim": 32,
+   "max_period": 4,
+   "max_state_dim": 32,
+   "min_period": 0.004,
+   "n_action_steps": 50,
+   "n_obs_steps": 1,
+   "normalization_mapping": {
+     "ACTION": "MEAN_STD",
+     "STATE": "MEAN_STD",
+     "VISUAL": "IDENTITY"
+   },
+   "num_expert_layers": 0,
+   "num_steps": 10,
+   "num_vlm_layers": 16,
+   "optimizer_betas": [
+     0.9,
+     0.95
+   ],
+   "optimizer_eps": "1e-08",
+   "optimizer_grad_clip_norm": 10,
+   "optimizer_lr": 0.0001,
+   "optimizer_weight_decay": "1e-10",
+   "output_features": {
+     "action": {
+       "shape": [
+         6
+       ],
+       "type": "ACTION"
+     }
+   },
+   "pad_language_to": "max_length",
+   "prefix_length": 0,
+   "resize_imgs_with_padding": [
+     512,
+     512
+   ],
+   "scheduler_decay_lr": 2.5e-06,
+   "scheduler_decay_steps": 30000,
+   "scheduler_warmup_steps": 1000,
+   "self_attn_every_n_layers": 2,
+   "tokenizer_max_length": 48,
+   "train_expert_only": true,
+   "train_state_proj": true,
+   "type": "smolvla",
+   "use_amp": false,
+   "use_cache": true,
+   "use_delta_joint_actions_aloha": false,
+   "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eeb41c1ae577c22207c9d16e3c0a303ee2dc00ad2b19ea987fdcd47db2d3283e
+ size 906713296
modeling_lerobot_policy.py ADDED
@@ -0,0 +1,72 @@
+
+ import torch.nn as nn
+ from transformers import PreTrainedModel, PretrainedConfig
+ # This import assumes 'lerobot' is installed in the user's environment
+ from lerobot.smolvla_base import SmolVLABasePolicy
+
+ class LerobotSmolVLAConfig(PretrainedConfig):
+     model_type = "lerobot_smolvla"
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         self.adapt_to_pi_aloha = False
+         self.add_image_special_tokens = False
+         self.attention_mode = 'cross_attn'
+         self.chunk_size = 50
+         self.device = 'cuda'
+         self.empty_cameras = 0
+         self.expert_width_multiplier = 0.75
+         self.freeze_vision_encoder = True
+         self.input_features = {
+             'observation.image': {'shape': [3, 256, 256], 'type': 'VISUAL'},
+             'observation.image2': {'shape': [3, 256, 256], 'type': 'VISUAL'},
+             'observation.image3': {'shape': [3, 256, 256], 'type': 'VISUAL'},
+             'observation.state': {'shape': [6], 'type': 'STATE'},
+         }
+         self.load_vlm_weights = True
+         self.max_action_dim = 32
+         self.max_period = 4
+         self.max_state_dim = 32
+         self.min_period = 0.004
+         self.n_action_steps = 50
+         self.n_obs_steps = 1
+         self.normalization_mapping = {'ACTION': 'MEAN_STD', 'STATE': 'MEAN_STD', 'VISUAL': 'IDENTITY'}
+         self.num_expert_layers = 0
+         self.num_steps = 10
+         self.num_vlm_layers = 16
+         self.optimizer_betas = [0.9, 0.95]
+         self.optimizer_eps = '1e-08'
+         self.optimizer_grad_clip_norm = 10
+         self.optimizer_lr = 0.0001
+         self.optimizer_weight_decay = '1e-10'
+         self.output_features = {'action': {'shape': [6], 'type': 'ACTION'}}
+         self.pad_language_to = 'max_length'
+         self.prefix_length = 0
+         self.resize_imgs_with_padding = [512, 512]
+         self.scheduler_decay_lr = 2.5e-06
+         self.scheduler_decay_steps = 30000
+         self.scheduler_warmup_steps = 1000
+         self.self_attn_every_n_layers = 2
+         self.tokenizer_max_length = 48
+         self.train_expert_only = True
+         self.train_state_proj = True
+         self.type = 'smolvla'
+         self.use_amp = False
+         self.use_cache = True
+         self.use_delta_joint_actions_aloha = False
+         self.vlm_model_name = 'HuggingFaceTB/SmolVLM2-500M-Video-Instruct'
+
+         # Only attributes that were not hardcoded above are taken from kwargs,
+         # so the defaults in this __init__ always win over values in config.json.
+         for k, v in kwargs.items():
+             if not hasattr(self, k):
+                 setattr(self, k, v)
+
+ class LerobotSmolVLAWrappedModel(PreTrainedModel):
+     config_class = LerobotSmolVLAConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+         # to_dict() extracts all config parameters for the policy
+         policy_init_kwargs = config.to_dict()
+         self.smolvla_policy = SmolVLABasePolicy(**policy_init_kwargs)
+
+     def forward(self, observations, actions=None, language_instruction=None, timestep=None):
+         # This explicit signature is better for usability and documentation
+         return self.smolvla_policy(
+             observations=observations,
+             actions=actions,
+             language_instruction=language_instruction,
+             timestep=timestep,
+         )
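+
+ # Note: the hardcoded assignments in LerobotSmolVLAConfig.__init__ run after
+ # super().__init__(**kwargs), and the loop at the end only sets attributes that
+ # do not already exist, so values loaded from config.json can never override
+ # those hardcoded defaults. A self-contained sketch of the pattern (the `Demo`
+ # class below is hypothetical, for illustration only):
+ if __name__ == "__main__":
+     class Demo(PretrainedConfig):
+         model_type = "demo"
+
+         def __init__(self, **kwargs):
+             super().__init__(**kwargs)  # kwargs initially sets chunk_size=99...
+             self.chunk_size = 50        # ...then the hardcoded default wins
+             for k, v in kwargs.items():
+                 if not hasattr(self, k):
+                     setattr(self, k, v)
+
+     print(Demo(chunk_size=99).chunk_size)  # prints 50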