2toINF committed on Apr 29

Commit

dd37dbc

verified ·

1 Parent(s): 6e6403b

Upload ckpt-200000 (X-VLA generalist)

Browse files

Files changed (18) hide show

__init__.py +0 -0
action_hub.py +295 -0
config.json +260 -0
configuration_florence2.py +340 -0
configuration_xvla.py +103 -0
merges.txt +0 -0
model.safetensors +3 -0
modeling_florence2.py +0 -0
modeling_xvla.py +435 -0
preprocessor_config.json +32 -0
processing_xvla.py +205 -0
server.py +95 -0
special_tokens_map.json +15 -0
state.json +1 -0
tokenizer.json +0 -0
tokenizer_config.json +59 -0
transformer.py +403 -0
vocab.json +0 -0

__init__.py ADDED Viewed

File without changes

action_hub.py ADDED Viewed

	@@ -0,0 +1,295 @@

+# ------------------------------------------------------------------------------
+# Copyright 2025 2toINF (https://github.com/2toINF)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ------------------------------------------------------------------------------
+from __future__ import annotations
+from typing import Iterable, Tuple, Dict, Type
+import torch
+import torch.nn as nn
+# =============================================================================
+# Registry
+# =============================================================================
+# Module-level lookup table: registry key (str) -> action-space class.
+ACTION_REGISTRY: Dict[str, Type["BaseActionSpace"]] = {}
+def register_action(name: str):
+    """
+    Build a class decorator that records the decorated class in ACTION_REGISTRY.
+    The registry key is `name` lower-cased; the same key is written to the
+    class attribute `name`. Raises KeyError when the key is already taken.
+    """
+    def _wrap(space_cls):
+        key = name.lower()
+        if key not in ACTION_REGISTRY:
+            # First registration under this key: store the class and tag it.
+            ACTION_REGISTRY[key] = space_cls
+            space_cls.name = key
+            return space_cls
+        raise KeyError(f"ActionSpace '{key}' already registered -> {ACTION_REGISTRY[key]}")
+    return _wrap
+def build_action_space(name: str, **kwargs) -> "BaseActionSpace":
+    """
+    Look up `name` (case-insensitively) in ACTION_REGISTRY and construct it.
+    All keyword arguments are forwarded to the registered class. Raises
+    KeyError, listing the available keys, when the name is not registered.
+    """
+    key = name.lower()
+    if key in ACTION_REGISTRY:
+        space_cls = ACTION_REGISTRY[key]
+        return space_cls(**kwargs)
+    raise KeyError(f"Unknown action space '{name}'. Available: {list(ACTION_REGISTRY.keys())}")
+# =============================================================================
+# Base class
+# =============================================================================
+class BaseActionSpace(nn.Module):
+    """
+    Common parent of every action-space definition.
+    The hooks below are identity functions here; only `compute_loss` has to be
+    supplied by a subclass (it raises NotImplementedError otherwise).
+      - `name`: registry key of the space ("base" here).
+      - `dim_action`: width of the action vector (0 here).
+      - `idx_for_delta`: tuple of channel indices (empty here).
+      - `compute_loss(pred, target)`: supervised loss, returned as a dict.
+      - `prepare_for_training(action, proprio)`: training-time hook.
+      - `preprocess(proprio, action, mode)`: pre-step hook.
+      - `postprocess(action, **kwargs)`: post-step hook.
+    """
+    name: str = "base"
+    dim_action: int = 0
+    idx_for_delta: Tuple[int, ...] = ()
+    def __init__(self, **kwargs):
+        # Keyword arguments are accepted so subclasses can share one call
+        # convention; the base class itself does not use them.
+        super().__init__()
+    # ---------------------------------------------------------------------
+    # Supervised loss
+    # ---------------------------------------------------------------------
+    def compute_loss(self, pred: torch.Tensor, target: torch.Tensor) -> Dict[str, torch.Tensor]:
+        """Return the loss terms for `pred` against `target`; subclasses must override."""
+        raise NotImplementedError
+    def forward(self, pred: torch.Tensor, target: torch.Tensor) -> Dict[str, torch.Tensor]:
+        """Delegate to `compute_loss` so the module can be called directly."""
+        loss_terms = self.compute_loss(pred, target)
+        return loss_terms
+    def prepare_for_training(self, action, proprio):
+        """Training-time hook; the base version hands back `(action, proprio)` untouched."""
+        return action, proprio
+    # ---------------------------------------------------------------------
+    # Per-space hooks
+    # ---------------------------------------------------------------------
+    def preprocess(self, proprio: torch.Tensor, action: torch.Tensor, mode: str = "train") -> Tuple[torch.Tensor, torch.Tensor]:
+        """Pre-step hook; the base version hands back `(proprio, action)` untouched."""
+        return proprio, action
+    def postprocess(self, action: torch.Tensor, **kwargs) -> torch.Tensor:
+        """Post-step hook; the base version hands back `action` untouched and ignores extras."""
+        return action
+# =============================================================================
+# Utilities
+# =============================================================================
+def _ensure_indices_valid(D: int, idx: Iterable[int], name: str) -> None:
+    """Raise IndexError if any entry of `idx` is below 0 or not below `D`; otherwise return None."""
+    bad = []
+    for i in idx:
+        if i < 0 or i >= D:
+            bad.append(i)
+    if not bad:
+        return
+    raise IndexError(f"{name} contains out-of-range indices {bad} for action dim D={D}")
+# =============================================================================
+# Implementations
+# =============================================================================
+@register_action("ee6d")
+class EE6DActionSpace(BaseActionSpace):
+    """
+    End-effector layout with xyz, 6D rotation, and gripper channels.
+    The 20 channels are split into two groups of ten: indices 0-2 / 10-12 are
+    positions, 3-8 / 13-18 are rotation components and 9 / 19 are gripper
+    logits. Positions and rotations are trained with MSE, grippers with
+    BCE-with-logits; each term is multiplied by its *_SCALE constant.
+    """
+    dim_action = 20
+    gripper_idx = (9, 19)
+    GRIPPER_SCALE = 1.0
+    XYZ_SCALE = 500.0
+    ROT_SCALE = 10.0
+    POS_IDX_1 = (0, 1, 2)
+    POS_IDX_2 = (10, 11, 12)
+    ROT_IDX_1 = (3, 4, 5, 6, 7, 8)
+    ROT_IDX_2 = (13, 14, 15, 16, 17, 18)
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.mse = nn.MSELoss()
+        self.bce = nn.BCEWithLogitsLoss()
+    def compute_loss(self, pred, target):
+        """
+        Compute the three loss terms for a batch of predictions.
+        Args:
+            pred: rank-3 tensor (unpacked as B, T, D) of model outputs.
+            target: tensor with exactly the same shape as `pred`.
+        Returns:
+            Dict with keys "position_loss", "rotate6D_loss" and "gripper_loss".
+        Raises:
+            AssertionError: if the shapes differ.
+            IndexError: if a gripper index does not fit into D.
+        """
+        assert pred.shape == target.shape, "pred/target shapes must match"
+        B, T, D = pred.shape
+        _ensure_indices_valid(D, self.gripper_idx, "gripper_idx")
+        # Gripper BCE: one term per gripper channel, averaged over the channels.
+        g_losses = [self.bce(pred[:, :, gi], target[:, :, gi]) for gi in self.gripper_idx]
+        gripper_loss = sum(g_losses) / len(self.gripper_idx) * self.GRIPPER_SCALE
+        # XYZ position: both position groups are summed before scaling.
+        pos_loss = (
+            self.mse(pred[:, :, self.POS_IDX_1], target[:, :, self.POS_IDX_1]) +
+            self.mse(pred[:, :, self.POS_IDX_2], target[:, :, self.POS_IDX_2])
+        ) * self.XYZ_SCALE
+        # Rotation 6D: both rotation groups are summed before scaling.
+        rot_loss = (
+            self.mse(pred[:, :, self.ROT_IDX_1], target[:, :, self.ROT_IDX_1]) +
+            self.mse(pred[:, :, self.ROT_IDX_2], target[:, :, self.ROT_IDX_2])
+        ) * self.ROT_SCALE
+        return {
+            "position_loss": pos_loss,
+            "rotate6D_loss": rot_loss,
+            "gripper_loss": gripper_loss,
+        }
+    def preprocess(self, proprio, action, mode="train"):
+        """
+        Zero-out gripper channels in proprio/action.
+        Both inputs are cloned first, so the caller's tensors are left intact.
+        `mode` is accepted for interface compatibility and is not used here.
+        Returns:
+            Tuple `(proprio, action)` of the masked copies.
+        """
+        proprio_m = proprio.clone()
+        action_m = action.clone()
+        proprio_m[..., self.gripper_idx] = 0.0
+        action_m[..., self.gripper_idx] = 0.0
+        return proprio_m, action_m
+    def postprocess(self, action: torch.Tensor, proprio: torch.Tensor | None = None, **kwargs) -> torch.Tensor:
+        """
+        Apply sigmoid to gripper logits.
+        Args:
+            action: tensor whose last dimension holds the action channels.
+            proprio: not used by this space; only forwarded to the base hook.
+            **kwargs: extra keyword arguments, forwarded to the base hook.
+        Returns:
+            A tensor like `action` with the gripper channels passed through a
+            sigmoid. When the last dimension is too narrow to contain the
+            highest gripper index, `action` is returned as received.
+        """
+        if action.size(-1) > max(self.gripper_idx):
+            # Work on a copy so the caller's tensor keeps its raw logits.
+            action = action.clone()
+            action[..., self.gripper_idx] = torch.sigmoid(action[..., self.gripper_idx])
+        # The base hook takes extras by keyword only; handing `proprio` over
+        # positionally raises TypeError.
+        return super().postprocess(action, proprio=proprio, **kwargs)
+@register_action("auto")
+class AutoActionSpace(BaseActionSpace):
+    """
+    Auto-detecting action space that adapts to any action dimension.
+    - Model outputs max_dim for compatibility with pretrained models
+    - Loss is computed only on the first real_dim dimensions
+    - Postprocess trims output back to real_dim
+    Args:
+        real_dim: The actual action dimension from the dataset/policy feature
+        max_dim: The model's output dimension for pretrained VLA compatibility
+        idx_for_delta: Channels stored relative to proprio (see
+            `prepare_for_training` / `postprocess`)
+        idx_for_mask_proprio: Proprio channels that are overwritten with zeros
+        **kwargs: Accepted and ignored
+    """
+    SCALE = 100.0
+    def __init__(self,
+                 real_dim: int,
+                 max_dim: int = 20,
+                 idx_for_delta: Tuple[int, ...] = (),
+                 idx_for_mask_proprio: Tuple[int, ...] = (),
+                 **kwargs
+                ):
+        super().__init__()
+        self.real_dim = real_dim
+        self.dim_action = max_dim  # Model-facing dimension
+        self.idx_for_delta = idx_for_delta
+        self.idx_for_mask_proprio = idx_for_mask_proprio
+        self.mse = nn.MSELoss()
+    def _pad_to_model_dim(self, x: torch.Tensor | None) -> torch.Tensor | None:
+        """
+        Pad real_dim → max_dim (zeros for the dummy channels).
+        - None is passed through.
+        - A tensor whose last dimension already equals max_dim is returned
+          as the very same object (no copy).
+        - Any other width is first zero-padded or truncated to real_dim and
+          then zero-padded up to max_dim.
+        """
+        if x is None:
+            return None
+        if x.size(-1) == self.dim_action:
+            return x
+        if x.size(-1) != self.real_dim:
+            # If dimension doesn't match either, pad/trim to real_dim first
+            if x.size(-1) < self.real_dim:
+                pad_shape = list(x.shape[:-1]) + [self.real_dim - x.size(-1)]
+                pad = x.new_zeros(pad_shape)
+                x = torch.cat([x, pad], dim=-1)
+            else:
+                x = x[..., : self.real_dim]
+        pad_shape = list(x.shape[:-1]) + [self.dim_action - self.real_dim]
+        pad = x.new_zeros(pad_shape)
+        return torch.cat([x, pad], dim=-1)
+    def _trim_to_real_dim(self, x: torch.Tensor) -> torch.Tensor:
+        """Trim model output max_dim → real_dim."""
+        return x[..., : self.real_dim]
+    def compute_loss(self, pred: torch.Tensor, target: torch.Tensor) -> dict[str, torch.Tensor]:
+        """
+        Compute loss only on the first real_dim dimensions.
+        pred:   [B, T, max_dim] from the model
+        target: [B, T, real_dim] or [B, T, max_dim]
+        Loss = MSE(pred[:,:,:real_dim], target[:,:,:real_dim]) * SCALE
+        Returns a dict with the single key "loss".
+        """
+        pred = self._pad_to_model_dim(pred)
+        target = self._pad_to_model_dim(target)
+        assert pred.shape == target.shape, f"Shape mismatch: pred {pred.shape} vs target {target.shape}"
+        # only compute loss on the real dimensions
+        loss = (
+            self.mse(
+                pred[:, :, : self.real_dim],
+                target[:, :, : self.real_dim],
+            )
+            * self.SCALE
+        )
+        return {"loss": loss}
+    def prepare_for_training(self, action, proprio):
+        """
+        Return copies of `(action, proprio)` prepared for training.
+        The `idx_for_delta` channels of the action have the matching proprio
+        channels subtracted, and the `idx_for_mask_proprio` channels of the
+        proprio are zeroed. The subtraction uses the unmasked proprio.
+        """
+        action = action.clone()
+        proprio = proprio.clone()
+        # apply delta encoding if specified
+        # NOTE(review): assumes proprio broadcasts against action on the leading dims - confirm with caller
+        if self.idx_for_delta:
+            action[..., self.idx_for_delta] -= proprio[..., self.idx_for_delta]
+        if self.idx_for_mask_proprio:
+            proprio[..., self.idx_for_mask_proprio] = 0.0
+        return action, proprio
+    def preprocess(self, proprio: torch.Tensor, action: torch.Tensor, mode: str = "train"):
+        """
+        Pad proprio and action from real_dim to max_dim for the model and
+        zero the `idx_for_mask_proprio` channels of the proprio.
+        The caller's tensors are never modified. `mode` is not used.
+        Returns:
+            Tuple `(proprio, action)`; an input that is None stays None.
+        """
+        padded = self._pad_to_model_dim(proprio)
+        if self.idx_for_mask_proprio and padded is not None:
+            if padded is proprio:
+                # No padding was needed, so `padded` still aliases the caller's
+                # tensor; copy it before masking in place.
+                padded = padded.clone()
+            padded[..., self.idx_for_mask_proprio] = 0.0
+        return padded, self._pad_to_model_dim(action)
+    def postprocess(self, action: torch.Tensor, proprio: torch.Tensor) -> torch.Tensor:
+        """
+        Trim model output from max_dim to real_dim for real robot control.
+        When `idx_for_delta` is set, the matching proprio channels are added
+        back onto a copy of the action first (the inverse of the subtraction
+        in `prepare_for_training`); `proprio` is only read in that case.
+        """
+        if self.idx_for_delta:
+            action = action.clone()
+            action[..., self.idx_for_delta] += proprio[..., self.idx_for_delta]
+        return self._trim_to_real_dim(action)
+# =============================================================================
+# Exports
+# =============================================================================
+# Every entry must be a name this module actually defines: a star-import
+# raises AttributeError for any listed name that is missing.
+__all__ = [
+    "BaseActionSpace",
+    "build_action_space",
+    "register_action",
+    "EE6DActionSpace",
+    "AutoActionSpace",
+    "ACTION_REGISTRY",
+]

config.json ADDED Viewed

	@@ -0,0 +1,260 @@

+{
+  "action_mode": "auto",
+  "architectures": [
+    "XVLA"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_xvla.XVLAConfig",
+    "AutoModel": "modeling_xvla.XVLA"
+  },
+  "depth": 24,
+  "dim_time": 32,
+  "florence_config": {
+    "_attn_implementation_autoset": true,
+    "bos_token_id": 0,
+    "eos_token_id": 2,
+    "ignore_index": -100,
+    "is_encoder_decoder": true,
+    "model_type": "florence2",
+    "pad_token_id": 1,
+    "projection_dim": 1024,
+    "text_config": {
+      "_attn_implementation_autoset": true,
+      "_name_or_path": "",
+      "activation_dropout": 0.1,
+      "activation_function": "gelu",
+      "add_bias_logits": false,
+      "add_cross_attention": false,
+      "add_final_layer_norm": false,
+      "architectures": null,
+      "attention_dropout": 0.1,
+      "bad_words_ids": null,
+      "begin_suppress_tokens": null,
+      "bos_token_id": 0,
+      "chunk_size_feed_forward": 0,
+      "classif_dropout": 0.1,
+      "classifier_dropout": 0.0,
+      "cross_attention_hidden_size": null,
+      "d_model": 1024,
+      "decoder_attention_heads": 16,
+      "decoder_ffn_dim": 4096,
+      "decoder_layerdrop": 0.0,
+      "decoder_layers": 12,
+      "decoder_start_token_id": 2,
+      "diversity_penalty": 0.0,
+      "do_sample": false,
+      "dropout": 0.1,
+      "early_stopping": true,
+      "encoder_attention_heads": 16,
+      "encoder_ffn_dim": 4096,
+      "encoder_layerdrop": 0.0,
+      "encoder_layers": 12,
+      "encoder_no_repeat_ngram_size": 0,
+      "eos_token_id": 2,
+      "exponential_decay_length_penalty": null,
+      "finetuning_task": null,
+      "forced_bos_token_id": 0,
+      "forced_eos_token_id": 2,
+      "gradient_checkpointing": false,
+      "id2label": {
+        "0": "LABEL_0",
+        "1": "LABEL_1",
+        "2": "LABEL_2"
+      },
+      "init_std": 0.02,
+      "is_decoder": false,
+      "is_encoder_decoder": true,
+      "label2id": {
+        "LABEL_0": 0,
+        "LABEL_1": 1,
+        "LABEL_2": 2
+      },
+      "length_penalty": 1.0,
+      "max_length": 20,
+      "max_position_embeddings": 4096,
+      "min_length": 0,
+      "model_type": "florence2_language",
+      "no_repeat_ngram_size": 3,
+      "normalize_before": false,
+      "num_beam_groups": 1,
+      "num_beams": 3,
+      "num_hidden_layers": 12,
+      "num_return_sequences": 1,
+      "output_attentions": false,
+      "output_hidden_states": false,
+      "output_scores": false,
+      "pad_token_id": 1,
+      "prefix": null,
+      "problem_type": null,
+      "pruned_heads": {},
+      "remove_invalid_values": false,
+      "repetition_penalty": 1.0,
+      "return_dict": true,
+      "return_dict_in_generate": false,
+      "scale_embedding": false,
+      "sep_token_id": null,
+      "suppress_tokens": null,
+      "task_specific_params": null,
+      "temperature": 1.0,
+      "tf_legacy_loss": false,
+      "tie_encoder_decoder": false,
+      "tie_word_embeddings": true,
+      "tokenizer_class": null,
+      "top_k": 50,
+      "top_p": 1.0,
+      "torch_dtype": null,
+      "torchscript": false,
+      "typical_p": 1.0,
+      "use_bfloat16": false,
+      "use_cache": true,
+      "vocab_size": 51289
+    },
+    "torch_dtype": "float32",
+    "vision_config": {
+      "_attn_implementation_autoset": false,
+      "_name_or_path": "",
+      "add_cross_attention": false,
+      "architectures": null,
+      "bad_words_ids": null,
+      "begin_suppress_tokens": null,
+      "bos_token_id": null,
+      "chunk_size_feed_forward": 0,
+      "cross_attention_hidden_size": null,
+      "decoder_start_token_id": null,
+      "depths": [
+        1,
+        1,
+        9,
+        1
+      ],
+      "dim_embed": [
+        256,
+        512,
+        1024,
+        2048
+      ],
+      "diversity_penalty": 0.0,
+      "do_sample": false,
+      "drop_path_rate": 0.1,
+      "early_stopping": false,
+      "enable_checkpoint": false,
+      "encoder_no_repeat_ngram_size": 0,
+      "eos_token_id": null,
+      "exponential_decay_length_penalty": null,
+      "finetuning_task": null,
+      "forced_bos_token_id": null,
+      "forced_eos_token_id": null,
+      "id2label": {
+        "0": "LABEL_0",
+        "1": "LABEL_1"
+      },
+      "image_feature_source": [
+        "spatial_avg_pool",
+        "temporal_avg_pool"
+      ],
+      "image_pos_embed": {
+        "max_pos_embeddings": 50,
+        "type": "learned_abs_2d"
+      },
+      "is_decoder": false,
+      "is_encoder_decoder": false,
+      "label2id": {
+        "LABEL_0": 0,
+        "LABEL_1": 1
+      },
+      "length_penalty": 1.0,
+      "max_length": 20,
+      "min_length": 0,
+      "model_type": "davit",
+      "no_repeat_ngram_size": 0,
+      "num_beam_groups": 1,
+      "num_beams": 1,
+      "num_groups": [
+        8,
+        16,
+        32,
+        64
+      ],
+      "num_heads": [
+        8,
+        16,
+        32,
+        64
+      ],
+      "num_return_sequences": 1,
+      "output_attentions": false,
+      "output_hidden_states": false,
+      "output_scores": false,
+      "pad_token_id": null,
+      "patch_padding": [
+        3,
+        1,
+        1,
+        1
+      ],
+      "patch_prenorm": [
+        false,
+        true,
+        true,
+        true
+      ],
+      "patch_size": [
+        7,
+        3,
+        3,
+        3
+      ],
+      "patch_stride": [
+        4,
+        2,
+        2,
+        2
+      ],
+      "prefix": null,
+      "problem_type": null,
+      "projection_dim": 1024,
+      "pruned_heads": {},
+      "remove_invalid_values": false,
+      "repetition_penalty": 1.0,
+      "return_dict": true,
+      "return_dict_in_generate": false,
+      "sep_token_id": null,
+      "suppress_tokens": null,
+      "task_specific_params": null,
+      "temperature": 1.0,
+      "tf_legacy_loss": false,
+      "tie_encoder_decoder": false,
+      "tie_word_embeddings": true,
+      "tokenizer_class": null,
+      "top_k": 50,
+      "top_p": 1.0,
+      "torch_dtype": null,
+      "torchscript": false,
+      "typical_p": 1.0,
+      "use_bfloat16": false,
+      "visual_temporal_embedding": {
+        "max_temporal_embeddings": 100,
+        "type": "COSINE"
+      },
+      "window_size": 12
+    },
+    "vocab_size": 51289
+  },
+  "hidden_size": 1024,
+  "idx_for_delta": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
+  "idx_for_mask_proprio": [12, 13, 14, 15, 16, 17, 18],
+  "len_soft_prompts": 32,
+  "max_action_dim": 20,
+  "max_len_seq": 512,
+  "mlp_ratio": 4.0,
+  "model_type": "xvla",
+  "num_actions": 30,
+  "num_domains": 30,
+  "num_heads": 16,
+  "real_action_dim": 20,
+  "soft_prompt_length": 32,
+  "torch_dtype": "float32",
+  "transformers_version": "4.51.3",
+  "use_hetero_proj": false,
+  "use_proprio": true
+}

configuration_florence2.py ADDED Viewed

	@@ -0,0 +1,340 @@

+# coding=utf-8
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+""" Florence-2 configuration"""
+from typing import Optional
+from transformers import AutoConfig
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class Florence2VisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Florence2VisionModel`]. It is used to instantiate a Florence2VisionModel
+    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Florence2VisionModel architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        drop_path_rate (`float`, *optional*, defaults to 0.1):
+            The dropout rate of the drop path layer.
+        patch_size (`List[int]`, *optional*, defaults to [7, 3, 3, 3]):
+            The patch size of the image.
+        patch_stride (`List[int]`, *optional*, defaults to [4, 2, 2, 2]):
+            The patch stride of the image.
+        patch_padding (`List[int]`, *optional*, defaults to [3, 1, 1, 1]):
+            The patch padding of the image.
+        patch_prenorm (`List[bool]`, *optional*, defaults to [False, True, True, True]):
+            Whether to apply layer normalization before the patch embedding layer.
+        enable_checkpoint (`bool`, *optional*, defaults to False):
+            Whether to enable checkpointing.
+        dim_embed (`List[int]`, *optional*, defaults to [256, 512, 1024, 2048]):
+            The dimension of the embedding layer.
+        num_heads (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
+            The number of attention heads.
+        num_groups (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
+            The number of groups.
+        depths (`List[int]`, *optional*, defaults to [1, 1, 9, 1]):
+            The depth of the model.
+        window_size (`int`, *optional*, defaults to 12):
+            The window size of the model.
+        projection_dim (`int`, *optional*, defaults to 1024):
+            The dimension of the projection layer.
+        visual_temporal_embedding (`dict`, *optional*):
+            The configuration of the visual temporal embedding.
+        image_pos_embed (`dict`, *optional*):
+            The configuration of the image position embedding.
+        image_feature_source (`List[str]`, *optional*, defaults to ["spatial_avg_pool", "temporal_avg_pool"]):
+            The source of the image feature.
+    List arguments are copied on assignment, so the stored lists are never the
+    same objects as the caller's lists or the signature defaults.
+    Example:
+    ```python
+    >>> from transformers import Florence2VisionConfig, Florence2VisionModel
+    >>> # Initializing a Florence2 Vision style configuration
+    >>> configuration = Florence2VisionConfig()
+    >>> # Initializing a model (with random weights)
+    >>> model = Florence2VisionModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "davit"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        drop_path_rate=0.1,
+        patch_size=[7, 3, 3, 3],
+        patch_stride=[4, 2, 2, 2],
+        patch_padding=[3, 1, 1, 1],
+        patch_prenorm=[False, True, True, True],
+        enable_checkpoint=False,
+        dim_embed=[256, 512, 1024, 2048],
+        num_heads=[8, 16, 32, 64],
+        num_groups=[8, 16, 32, 64],
+        depths=[1, 1, 9, 1],
+        window_size=12,
+        projection_dim=1024,
+        visual_temporal_embedding=None,
+        image_pos_embed=None,
+        image_feature_source=["spatial_avg_pool", "temporal_avg_pool"],
+        **kwargs,
+    ):
+        # The list defaults in the signature are created once and shared by
+        # every call. Store copies so that mutating one config's lists cannot
+        # change the defaults seen by later instances.
+        def _detached(value):
+            return list(value) if isinstance(value, list) else value
+        self.drop_path_rate = drop_path_rate
+        self.patch_size = _detached(patch_size)
+        self.patch_stride = _detached(patch_stride)
+        self.patch_padding = _detached(patch_padding)
+        self.patch_prenorm = _detached(patch_prenorm)
+        self.enable_checkpoint = enable_checkpoint
+        self.dim_embed = _detached(dim_embed)
+        self.num_heads = _detached(num_heads)
+        self.num_groups = _detached(num_groups)
+        self.depths = _detached(depths)
+        self.window_size = window_size
+        self.projection_dim = projection_dim
+        self.visual_temporal_embedding = visual_temporal_embedding
+        self.image_pos_embed = image_pos_embed
+        self.image_feature_source = _detached(image_feature_source)
+        super().__init__(**kwargs)
+class Florence2LanguageConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Florence2LanguagePreTrainedModel`]. It is used to instantiate a BART
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the BART
+    [facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 51289):
+            Vocabulary size of the Florence2Language model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Florence2LanguageModel`].
+        d_model (`int`, *optional*, defaults to 1024):
+            Dimensionality of the layers and the pooler layer.
+        encoder_layers (`int`, *optional*, defaults to 12):
+            Number of encoder layers.
+        decoder_layers (`int`, *optional*, defaults to 12):
+            Number of decoder layers.
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        classifier_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for classifier.
+        max_position_embeddings (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+            for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+            for more details.
+        scale_embedding (`bool`, *optional*, defaults to `False`):
+            Scale embeddings by multiplying by sqrt(d_model).
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        num_labels (`int`, *optional*, defaults to 3):
+            The number of labels to use in [`Florence2LanguageForSequenceClassification`].
+        forced_eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
+            `eos_token_id`.
+    Example:
+    ```python
+    >>> from transformers import Florence2LanguageConfig, Florence2LanguageModel
+    >>> # Initializing a Florence2 Language style configuration
+    >>> configuration = Florence2LanguageConfig()
+    >>> # Initializing a model (with random weights)
+    >>> model = Florence2LanguageModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "florence2_language"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Generic attribute names are aliased onto the encoder-side fields.
+    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
+    def __init__(
+        self,
+        vocab_size=51289,
+        max_position_embeddings=1024,
+        encoder_layers=12,
+        encoder_ffn_dim=4096,
+        encoder_attention_heads=16,
+        decoder_layers=12,
+        decoder_ffn_dim=4096,
+        decoder_attention_heads=16,
+        encoder_layerdrop=0.0,
+        decoder_layerdrop=0.0,
+        activation_function="gelu",
+        d_model=1024,
+        dropout=0.1,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
+        init_std=0.02,
+        classifier_dropout=0.0,
+        scale_embedding=False,
+        use_cache=True,
+        num_labels=3,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        is_encoder_decoder=True,
+        decoder_start_token_id=2,
+        forced_eos_token_id=2,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.d_model = d_model
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.init_std = init_std
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.classifier_dropout = classifier_dropout
+        self.use_cache = use_cache
+        # num_hidden_layers mirrors the encoder depth, not the decoder depth.
+        self.num_hidden_layers = encoder_layers
+        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+        # Token ids and the remaining generic options are handled by the parent class.
+        super().__init__(
+            num_labels=num_labels,
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            decoder_start_token_id=decoder_start_token_id,
+            forced_eos_token_id=forced_eos_token_id,
+            **kwargs,
+        )
+        # ensure backward compatibility for BART CNN models
+        # A legacy `force_bos_token_to_be_generated=True` kwarg is translated
+        # into `forced_bos_token_id` when the latter was not given explicitly.
+        if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
+            self.forced_bos_token_id = self.bos_token_id
+            warnings.warn(
+                f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
+                "The config can simply be saved and uploaded again to be fixed."
+            )
+class Florence2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate a
+    Florence-2 model according to the specified arguments, defining the model architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vision_config (`Florence2VisionConfig` or `dict`, *optional*):
+            Vision backbone configuration. A mapping is expanded into a `Florence2VisionConfig`; an already-built
+            `PretrainedConfig` instance is stored unchanged. `None` leaves `vision_config` set to `None`.
+        text_config (`Florence2LanguageConfig` or `dict`, *optional*):
+            Text backbone configuration. A mapping is expanded into a `Florence2LanguageConfig`; an already-built
+            `PretrainedConfig` instance is stored unchanged. `None` leaves `text_config` set to `None`.
+        ignore_index (`int`, *optional*, defaults to -100):
+            The ignore index for the loss function.
+        vocab_size (`int`, *optional*, defaults to 51289):
+            Vocabulary size of the Florence-2 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`~Florence2ForConditionalGeneration`]
+        projection_dim (`int`, *optional*, defaults to 1024):
+            Dimension of the multimodal projection space.
+    Example:
+    ```python
+    >>> from transformers import Florence2ForConditionalGeneration, Florence2Config, CLIPVisionConfig, BartConfig
+    >>> # Initializing a clip-like vision config
+    >>> vision_config = CLIPVisionConfig()
+    >>> # Initializing a Bart config
+    >>> text_config = BartConfig()
+    >>> # Initializing a Florence-2 configuration
+    >>> configuration = Florence2Config(vision_config, text_config)
+    >>> # Initializing a model from the florence-2 configuration
+    >>> model = Florence2ForConditionalGeneration(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "florence2"
+    is_composition = False
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        ignore_index=-100,
+        vocab_size=51289,
+        projection_dim=1024,
+        **kwargs,
+    ):
+        self.ignore_index = ignore_index
+        self.vocab_size = vocab_size
+        self.projection_dim = projection_dim
+        # Mappings (such as the nested dicts of a serialized config) are expanded into
+        # sub-config objects. Ready-made config instances are kept as they are; unpacking
+        # them with `**` would raise a TypeError.
+        if vision_config is not None and not isinstance(vision_config, PretrainedConfig):
+            vision_config = Florence2VisionConfig(**vision_config)
+        self.vision_config = vision_config
+        if text_config is not None and not isinstance(text_config, PretrainedConfig):
+            text_config = Florence2LanguageConfig(**text_config)
+        self.text_config = text_config
+        super().__init__(**kwargs)

configuration_xvla.py ADDED Viewed

	@@ -0,0 +1,103 @@

+# ------------------------------------------------------------------------------
+# Copyright 2025 2toINF (https://github.com/2toINF)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ------------------------------------------------------------------------------
+from .configuration_florence2 import Florence2Config
+from transformers.configuration_utils import PretrainedConfig
+class XVLAConfig(PretrainedConfig):
+    """
+    Configuration class for the **XVLA (Extended Vision-Language-Action)** model.
+    This configuration defines all submodules of XVLA in a single place:
+      - The visual-language backbone (Florence2)
+      - The temporal/action transformer
+      - The action/proprio setup
+    Every constructor argument is stored on the instance under the same name.
+    `florence_config` may be a dict (expanded into a `Florence2Config`), an existing
+    `Florence2Config` (stored as-is), or anything else (a default `Florence2Config()`
+    is created).
+    """
+    model_type = "xvla"
+    def __init__(
+        self,
+        # === Florence backbone ===
+        # Quoted annotation: this module has no `from __future__ import annotations`,
+        # so an unquoted `dict | None` would be evaluated at definition time and fail
+        # on Python < 3.10.
+        florence_config: "dict | Florence2Config | None" = None,
+        # === Transformer head ===
+        hidden_size: int = 1024,
+        depth: int = 24,
+        num_heads: int = 16,
+        mlp_ratio: float = 4.0,
+        num_domains: int = 30,
+        len_soft_prompts: int = 32,
+        dim_time: int = 32,
+        max_len_seq: int = 512,
+        use_hetero_proj: bool = False,
+        # NOTE(review): looks like a duplicate of `len_soft_prompts`; kept for
+        # compatibility with existing config files -- confirm whether anything reads it.
+        soft_prompt_length: int = 32,
+        # === Action & proprio ===
+        max_action_dim: int = 20,  # Maximum action dimension for padding (used by "auto" action mode)
+        real_action_dim: int = 20,
+        idx_for_delta: "tuple[int, ...] | list[int]" = (),  # Indices of action dimensions to apply delta encoding
+        idx_for_mask_proprio: "tuple[int, ...] | list[int]" = (),  # Indices of proprio dimensions to mask
+        num_actions: int = 30,
+        action_mode: str = "ee6d",
+        use_proprio: bool = True,
+        **kwargs,
+    ):
+        # Florence2 backbone configuration
+        if isinstance(florence_config, dict):
+            self.florence_config = Florence2Config(**florence_config)
+        elif isinstance(florence_config, Florence2Config):
+            self.florence_config = florence_config
+        else:
+            self.florence_config = Florence2Config()
+        # Transformer hyperparameters
+        self.hidden_size = hidden_size
+        self.depth = depth
+        self.num_heads = num_heads
+        self.mlp_ratio = mlp_ratio
+        self.num_domains = num_domains
+        self.len_soft_prompts = len_soft_prompts
+        self.dim_time = dim_time
+        self.max_len_seq = max_len_seq
+        self.use_hetero_proj = use_hetero_proj
+        self.soft_prompt_length = soft_prompt_length
+        # Action/proprioception settings
+        self.num_actions = num_actions
+        self.action_mode = action_mode
+        self.use_proprio = use_proprio
+        self.real_action_dim = real_action_dim
+        self.max_action_dim = max_action_dim
+        self.idx_for_delta = idx_for_delta
+        self.idx_for_mask_proprio = idx_for_mask_proprio
+        # Initialize base HF config attributes (e.g. name_or_path)
+        super().__init__(**kwargs)
+    # -------------------------------------------------------------------------
+    # Serialization helpers
+    # -------------------------------------------------------------------------
+    def to_dict(self):
+        """
+        Convert this configuration (and its Florence sub-config)
+        into a fully serializable dictionary for HF save/load.
+        The `florence_config` entry is replaced by the sub-config's own `to_dict()` output.
+        """
+        output = super().to_dict()
+        output["florence_config"] = self.florence_config.to_dict()
+        return output

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9068158357841b0245c85e085cbbb62a033d8b86a8bd26eb721d59ec1902cbd1
+size 3519068172

modeling_florence2.py ADDED Viewed

The diff for this file is too large to render. See raw diff

modeling_xvla.py ADDED Viewed

	@@ -0,0 +1,435 @@

+# ------------------------------------------------------------------------------
+# Copyright 2025 2toINF (https://github.com/2toINF)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ------------------------------------------------------------------------------
+from __future__ import annotations
+from typing import Any, Dict, List
+import torch
+import numpy as np
+from PIL import Image
+from fastapi import FastAPI
+import cv2
+from transformers import PreTrainedModel
+from .server import ModelServer
+from .modeling_florence2 import Florence2ForConditionalGeneration
+from .transformer import SoftPromptedTransformer
+from .action_hub import build_action_space
+from .configuration_xvla import XVLAConfig
+class XVLA(PreTrainedModel, ModelServer):
+    """
+    XVLA: HuggingFace-compatible Vision-Language-Action policy.
+    Components:
+      • Florence2 encoder-only backbone (vision-language)
+      • SoftPromptedTransformer (temporal/action head)
+      • Action space (pre/post-processing + loss)
+    """
+    config_class = XVLAConfig
+    base_model_prefix = "xvla"
+    supports_gradient_checkpointing = True
+    def __init__(self, config: XVLAConfig, *args, **kwargs):
+        """
+        Assemble the action space, the Florence2 encoder and the action transformer.
+        Parameters
+        ----------
+        config : XVLAConfig
+            Supplies the action mode, the Florence2 sub-config and the transformer sizes.
+        *args, **kwargs
+            Forwarded unchanged to the parent constructor.
+        Raises
+        ------
+        ValueError
+            If the Florence2 config exposes no `projection_dim`.
+        """
+        super().__init__(config, *args, **kwargs)
+        mode = config.action_mode.lower()
+        # Core settings
+        self.num_actions: int = config.num_actions
+        self.use_proprio: bool = config.use_proprio
+        self.action_mode: str = mode
+        # Only the "auto" action space receives the dimension/index settings.
+        space_kwargs = {}
+        if mode == "auto":
+            space_kwargs = dict(
+                real_dim=config.real_action_dim,
+                max_dim=config.max_action_dim,
+                idx_for_delta=config.idx_for_delta,
+                idx_for_mask_proprio=config.idx_for_mask_proprio,
+            )
+        self.action_space = build_action_space(mode, **space_kwargs)
+        dim_action = self.action_space.dim_action
+        # Proprio width falls back to the action width when the space does not define one.
+        dim_proprio = getattr(self.action_space, "dim_proprio", dim_action)
+        # Florence2 backbone: only the encoder side is kept, so the decoder and LM head
+        # are removed when present.
+        self.vlm = Florence2ForConditionalGeneration(config.florence_config).to(torch.float32)
+        if hasattr(self.vlm, "language_model"):
+            language_model = self.vlm.language_model
+            has_decoder = hasattr(language_model, "model") and hasattr(language_model.model, "decoder")
+            if has_decoder:
+                del language_model.model.decoder
+            if hasattr(language_model, "lm_head"):
+                del language_model.lm_head
+        projection_dim = getattr(self.vlm.config, "projection_dim", None)
+        if projection_dim is None:
+            raise ValueError("Florence2 config must provide `projection_dim` for multimodal fusion.")
+        # Temporal/action head (`dim_propio` is the keyword name the transformer is called with).
+        head_kwargs = dict(
+            hidden_size=config.hidden_size,
+            multi_modal_input_size=projection_dim,
+            depth=config.depth,
+            num_heads=config.num_heads,
+            mlp_ratio=config.mlp_ratio,
+            num_domains=config.num_domains,
+            dim_action=dim_action,
+            dim_propio=dim_proprio,
+            len_soft_prompts=config.len_soft_prompts,
+            dim_time=config.dim_time,
+            max_len_seq=config.max_len_seq,
+            use_hetero_proj=config.use_hetero_proj,
+        )
+        self.transformer = SoftPromptedTransformer(**head_kwargs)
+        # The FastAPI app is created lazily; start without one.
+        self.app: FastAPI | None = None
+    # ========================== pretrained loading ================================
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        """
+        Build an XVLA model and load every shape-compatible checkpoint tensor into it.
+        * `config` and `torch_dtype` are taken from `kwargs`; when no config is given it
+          is loaded from `pretrained_model_name_or_path` with the remaining kwargs.
+        * Checkpoint keys that the model does not have are skipped.
+        * Keys whose shapes differ are logged and re-initialised: zeros when the key
+          contains "bias" or the tensor has fewer than two dimensions, Xavier-uniform
+          otherwise (intended to match ``DomainAwareLinear.__init__``).
+        Returns the constructed model.
+        """
+        import logging
+        from collections import OrderedDict
+        log = logging.getLogger(__name__)
+        config = kwargs.pop("config", None)
+        torch_dtype = kwargs.pop("torch_dtype", None)
+        if config is None:
+            config = cls.config_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        model = cls(config, *model_args)
+        if torch_dtype is not None:
+            model = model.to(torch_dtype)
+        checkpoint = cls._load_pretrained_state_dict(pretrained_model_name_or_path)
+        current = model.state_dict()
+        # Split checkpoint entries into loadable tensors and shape conflicts.
+        compatible = OrderedDict()
+        shape_conflicts = []
+        for name, tensor in checkpoint.items():
+            if name not in current:
+                continue
+            target_shape = current[name].shape
+            if tensor.shape == target_shape:
+                compatible[name] = tensor
+                continue
+            shape_conflicts.append((name, tuple(tensor.shape), tuple(target_shape)))
+        model.load_state_dict(compatible, strict=False)
+        if not shape_conflicts:
+            return model
+        report = "\n".join(
+            f"  {k}: pretrained {ps} -> current {cs}"
+            for k, ps, cs in shape_conflicts
+        )
+        log.warning("=== Mismatched pretrained keys (reinitialized) ===\n" + report)
+        with torch.no_grad():
+            for name, *_ in shape_conflicts:
+                # Walk the dotted key down to the module that owns the tensor.
+                *owner_path, leaf = name.split(".")
+                owner = model
+                for attr in owner_path:
+                    owner = getattr(owner, attr)
+                param = getattr(owner, leaf)
+                if "bias" not in name and param.dim() >= 2:
+                    torch.nn.init.xavier_uniform_(param)
+                else:
+                    torch.nn.init.zeros_(param)
+        log.warning(
+            "Above %d parameter(s) have been re-initialised.",
+            len(shape_conflicts),
+        )
+        return model
+    @staticmethod
+    def _load_pretrained_state_dict(model_path: str) -> dict:
+        """Read a state dict from a local checkpoint file or directory.
+        Lookup order: `model_path` itself when it is a file; then `model.safetensors`
+        or `pytorch_model.bin` inside the directory; then a sharded checkpoint described
+        by `model.safetensors.index.json` or `pytorch_model.bin.index.json`.
+        Raises FileNotFoundError when none of these exist.
+        """
+        import json
+        import os
+        from collections import OrderedDict
+        def _read_safetensors(path):
+            from safetensors.torch import load_file
+            return load_file(path)
+        def _read_bin(path):
+            # NOTE(review): torch.load unpickles the file; only use with trusted
+            # checkpoints (consider `weights_only=True` where the torch version allows).
+            return torch.load(path, map_location="cpu")
+        # Case 1: the path points directly at a weights file.
+        if os.path.isfile(model_path):
+            reader = _read_safetensors if model_path.endswith(".safetensors") else _read_bin
+            return reader(model_path)
+        # Case 2: a directory holding a single-file checkpoint.
+        single_file_candidates = (
+            ("model.safetensors", _read_safetensors),
+            ("pytorch_model.bin", _read_bin),
+        )
+        for fname, reader in single_file_candidates:
+            candidate = os.path.join(model_path, fname)
+            if os.path.isfile(candidate):
+                return reader(candidate)
+        # Case 3: a directory holding a sharded checkpoint plus its index file.
+        index_candidates = (
+            ("model.safetensors.index.json", _read_safetensors),
+            ("pytorch_model.bin.index.json", _read_bin),
+        )
+        for index_name, reader in index_candidates:
+            index_path = os.path.join(model_path, index_name)
+            if not os.path.isfile(index_path):
+                continue
+            with open(index_path) as f:
+                weight_map = json.load(f)["weight_map"]
+            merged = OrderedDict()
+            # dict.fromkeys de-duplicates shard names while keeping first-seen order.
+            for shard_file in dict.fromkeys(weight_map.values()):
+                merged.update(reader(os.path.join(model_path, shard_file)))
+            return merged
+        raise FileNotFoundError(
+            f"No checkpoint found at '{model_path}'. Expected "
+            f"model.safetensors, pytorch_model.bin, or sharded index files."
+        )
+    # ============================= Florence2 encoder =============================
+    def forward_vlm(
+        self,
+        input_ids: torch.LongTensor,        # [B, L]
+        pixel_values: torch.FloatTensor,    # [B, V, C, H, W]
+        image_mask: torch.Tensor,           # [B, V] (bool or 0/1)
+    ) -> Dict[str, torch.Tensor]:
+        """
+        Encode text + multi-view images via Florence2 encoder.
+        Only views whose mask entry is true are passed through the image encoder;
+        masked-out views keep all-zero features. The first view is merged with the
+        token embeddings and run through the language-model encoder; the remaining
+        views are returned separately, flattened along the token axis.
+        Returns:
+          { "vlm_features": [B, T_enc, D], "aux_visual_inputs": [B, (V-1)*N, D] }
+        Raises:
+          ValueError: if no view in the whole batch is marked valid. The count is taken
+          over the flattened mask, so a single sample with no valid view is not rejected.
+        """
+        B, V = pixel_values.shape[:2]
+        flat_mask = image_mask.view(-1).to(torch.bool)         # [B*V]
+        flat_images = pixel_values.flatten(0, 1)                # [B*V, C, H, W]
+        num_valid = int(flat_mask.sum().item())
+        if num_valid == 0:
+            raise ValueError("At least one image view must be valid per batch.")
+        # Encode only the valid views, then scatter them back into a zero-filled buffer.
+        valid_images = flat_images[flat_mask]                   # [#valid, C, H, W]
+        valid_feats = self.vlm._encode_image(valid_images)      # [#valid, N, D]
+        N, D = valid_feats.shape[1:]
+        image_features = valid_feats.new_zeros((B * V, N, D))
+        image_features[flat_mask] = valid_feats
+        image_features = image_features.view(B, V, N, D)        # [B, V, N, D]
+        inputs_embeds = self.vlm.get_input_embeddings()(input_ids)  # [B, L, D]
+        # Only the first view is fused with the text tokens.
+        merged_embeds, attention_mask = self.vlm._merge_input_ids_with_image_features(
+            image_features[:, 0],  # first view: [B, N, D]
+            inputs_embeds,         # [B, L, D]
+        )
+        enc_out = self.vlm.language_model.model.encoder(
+            attention_mask=attention_mask,
+            inputs_embeds=merged_embeds,
+        )[0]  # [B, T_enc, D]
+        aux_visual_inputs = image_features[:, 1:].reshape(B, -1, D)  # remaining views flattened
+        return {"vlm_features": enc_out, "aux_visual_inputs": aux_visual_inputs}
+    # ================================= training =================================
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        image_input: torch.FloatTensor,
+        image_mask: torch.Tensor,
+        domain_id: torch.LongTensor,
+        proprio: torch.Tensor,
+        action: torch.Tensor,  # [B, T=num_actions, D=dim_action]
+    ) -> Dict[str, torch.Tensor]:
+        """
+        Training step: returns whatever `action_space.compute_loss` produces.
+        1) Encode multimodal inputs.
+        2) Diffusion-style noisy mixture of actions: x_t = t*noise + (1-t)*gt.
+        3) Space-specific preprocessing, prediction, and supervised loss.
+        The loss is computed between the transformer output and the (prepared)
+        clean action, not the noise.
+        """
+        action, proprio = self.action_space.prepare_for_training(action, proprio)
+        enc = self.forward_vlm(input_ids, image_input, image_mask)
+        B = input_ids.shape[0]
+        # One shared random offset plus evenly spaced per-sample offsets (i / B),
+        # wrapped with a modulus just below 1 so every t lies in [0, 1 - 1e-5).
+        t = (torch.rand(1, device=input_ids.device)
+             + torch.arange(B, device=input_ids.device) / B) % (1 - 1e-5)
+        # Per-sample blend of Gaussian noise and the clean action; t is broadcast over [T, D].
+        action_noisy = torch.randn_like(action) * t.view(-1, 1, 1) + action * (1 - t).view(-1, 1, 1)
+        proprio_m, action_noisy_m = self.action_space.preprocess(proprio, action_noisy)
+        pred_action = self.transformer(
+            domain_id=domain_id,
+            action_with_noise=action_noisy_m,
+            t=t,
+            proprio=proprio_m,
+            **enc,
+        )
+        return self.action_space.compute_loss(pred_action, action)
+    # ================================= inference =================================
+    @torch.no_grad()
+    def generate_actions(
+        self,
+        input_ids: torch.LongTensor,
+        image_input: torch.FloatTensor,
+        image_mask: torch.Tensor,
+        domain_id: torch.LongTensor,
+        proprio: torch.Tensor,
+        steps: int = 10,
+    ) -> torch.Tensor:
+        """
+        Iterative denoising (linear schedule).
+        Starts from Gaussian noise and runs `steps` transformer passes with
+        t = steps/steps, (steps-1)/steps, ..., 1/steps. Each pass feeds a blend of the
+        fixed initial noise and the previous prediction, and its output becomes the
+        new prediction. `steps` is clamped to at least 1.
+        Applies action_space.postprocess at the end (e.g., sigmoid on gripper).
+        Side effect: puts the module in eval mode and does not restore the previous mode.
+        """
+        self.eval()
+        enc = self.forward_vlm(input_ids, image_input, image_mask)
+        B = input_ids.shape[0]
+        D = self.action_space.dim_action
+        # Initial noise is drawn once and reused at every step; device/dtype follow `proprio`.
+        x1 = torch.randn(B, self.num_actions, D, device=proprio.device, dtype=proprio.dtype)
+        # Running action estimate; all zeros before the first transformer pass.
+        action = torch.zeros_like(x1)
+        steps = max(1, int(steps))
+        for i in range(steps, 0, -1):
+            t = torch.full((B,), i / steps, device=proprio.device, dtype=proprio.dtype)
+            # Same mixture as in training, with the current estimate in place of the ground truth.
+            x_t = x1 * t.view(-1, 1, 1) + action * (1 - t).view(-1, 1, 1)
+            proprio_m, x_t_m = self.action_space.preprocess(proprio, x_t)
+            action = self.transformer(
+                domain_id=domain_id,
+                action_with_noise=x_t_m,
+                proprio=proprio_m,
+                t=t,
+                **enc,
+            )
+        return self.action_space.postprocess(action, proprio=proprio)
+    # =============================== FastAPI service =============================
+    def inference_api(self, payload: Dict[str, Any] | List[Dict[str, Any]], **kwargs) -> np.ndarray:
+        """
+        XVLA inference supporting:
+        - Single sample: payload is a dict of scalars/arrays.
+        - Grouped batch: payload is a list of such dicts (one per sample).
+        payload contents (per sample):
+        - "language_instruction": str, optional (defaults to "")
+        - "image0", "image1", ... : np.ndarray (H, W, C) or 1-D encoded buffer, required
+        - "proprio": array-like (D,), required
+        - "domain_id": int, optional (defaults to 0)
+        - "steps": int, optional, default=10; read from the first sample only and
+          applied to the whole batch
+        Keyword arguments:
+        - processor: callable turning `images` / `language_instruction` into model
+          inputs, required
+        Returns:
+        - (T, D) when payload is a single dict
+        - (B, T, D) when payload is a list
+        Raises:
+        - ValueError: missing processor, empty batch, or missing/undecodable images
+        - TypeError: payload is neither a dict nor a list/tuple of dicts
+        """
+        # -------------------------
+        # 1) Normalize payload -> List[Dict[str, Any]]
+        # -------------------------
+        processor = kwargs.get("processor")
+        if processor is None:
+            raise ValueError("inference_api requires a `processor` keyword argument.")
+        is_single = isinstance(payload, dict)
+        if is_single:
+            batch_payloads: List[Dict[str, Any]] = [payload]
+        elif isinstance(payload, (list, tuple)):
+            # Previously a list payload left `batch_payloads` unbound (NameError).
+            batch_payloads = list(payload)
+        else:
+            raise TypeError(f"payload must be a dict or a list of dicts, got {type(payload)}")
+        if not batch_payloads:
+            raise ValueError("payload list is empty; expected at least one sample.")
+        reference_param = next(self.parameters())
+        device = reference_param.device
+        dtype = reference_param.dtype
+        # -------------------------
+        # 2) Utilities
+        # -------------------------
+        def move_to_device(x: Any) -> torch.Tensor:
+            """Convert to tensor and move to model device/dtype."""
+            tensor = x if isinstance(x, torch.Tensor) else torch.as_tensor(x)
+            # Only floating-point tensors take the model dtype; ids and masks keep theirs.
+            if tensor.is_floating_point():
+                return tensor.to(device=device, dtype=dtype)
+            return tensor.to(device=device)
+        def decode_image_list(sample: Dict[str, Any]) -> List[Image.Image]:
+            """Decode image0/image1/... from np.ndarray into PIL Images."""
+            images: List[Image.Image] = []
+            idx = 0
+            while f"image{idx}" in sample:
+                arr = sample[f"image{idx}"]
+                if not isinstance(arr, np.ndarray):
+                    raise ValueError(f"image{idx} must be np.ndarray, got {type(arr)}")
+                if arr.ndim == 1:  # encoded buffer
+                    arr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
+                    if arr is None:
+                        raise ValueError(f"cv2.imdecode failed for image{idx}")
+                    # OpenCV decodes to BGR; convert to RGB before building the PIL image.
+                    arr = cv2.cvtColor(arr, cv2.COLOR_BGR2RGB)
+                images.append(Image.fromarray(arr))
+                idx += 1
+            if not images:
+                raise ValueError("Missing images: expected keys image0, image1, ...")
+            return images
+        # -------------------------
+        # 3) Per-sample preprocessing + strict collation (no padding)
+        # -------------------------
+        language_batch: List[str] = []
+        images_batch: List[List[Image.Image]] = []
+        proprio_batch: List[torch.Tensor] = []
+        domain_id_list: List[int] = []
+        denoising_steps = batch_payloads[0].get("steps", 10)
+        for sample in batch_payloads:
+            images_batch.append(decode_image_list(sample))
+            language_batch.append(sample.get("language_instruction", ""))
+            proprio_batch.append(move_to_device(sample["proprio"]))
+            domain_id_list.append(int(sample.get("domain_id", 0)))
+        model_inputs = processor(
+            images=images_batch,
+            language_instruction=language_batch,
+        )
+        model_inputs = {k: move_to_device(v) for k, v in model_inputs.items()}
+        model_inputs.update(
+            proprio=torch.stack(proprio_batch, dim=0),  # (B, state_dim)
+            domain_id=torch.tensor(domain_id_list, dtype=torch.long, device=device),  # (B,)
+            steps=denoising_steps,  # one scalar for whole batch
+        )
+        # -------------------------
+        # 4) Inference
+        # -------------------------
+        self.eval()
+        with torch.inference_mode():
+            actions = self.generate_actions(**model_inputs)  # expected: (B, T, D)
+        actions_np = actions.float().cpu().numpy()
+        # A single-dict payload gets its batch axis stripped; list payloads keep it.
+        return actions_np[0] if is_single else actions_np

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "auto_map": {
+    "AutoProcessor": "processing_xvla.XVLAProcessor"
+  },
+  "crop_size": {
+    "height": 224,
+    "width": 224
+  },
+  "do_center_crop": false,
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.485,
+    0.456,
+    0.406
+  ],
+  "image_processor_type": "CLIPImageProcessor",
+  "image_std": [
+    0.229,
+    0.224,
+    0.225
+  ],
+  "processor_class": "XVLAProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 224,
+    "width": 224
+  }
+}

processing_xvla.py ADDED Viewed

	@@ -0,0 +1,205 @@

+# ------------------------------------------------------------------------------
+# Copyright 2025 2toINF (https://github.com/2toINF)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ------------------------------------------------------------------------------
+from transformers import ProcessorMixin
+from typing import List, Union, Dict, Any, Optional
+import torch
+class XVLAProcessor(ProcessorMixin):
+    """
+    XVLAProcessor: Unified multimodal processor for XVLA models.
+    Handles:
+      - Multi-view image inputs (e.g., from multiple cameras).
+      - Batch processing for multiple samples.
+      - Joint tokenization and image tensor preparation.
+    This processor combines an image processor and a tokenizer under a single interface
+    so that users can call it directly like:
+        >>> processor = XVLAProcessor.from_pretrained("path/to/xvla")
+        >>> inputs = processor(images=batch_images, language_instruction=batch_texts)
+    It is fully compatible with the Hugging Face AutoProcessor API.
+    Attributes
+    ----------
+    num_views : int, default=3
+        Number of image views per sample. Missing views are padded with zeros;
+        a sample with more views than this is rejected.
+    language_max_length : int, default=50
+        Maximum token length for text encoding.
+    attributes : list
+        Required by ProcessorMixin to know which submodules are stored and reloaded.
+    image_processor_class : str
+        The name of the associated image processor class.
+    tokenizer_class : tuple(str)
+        The names of compatible tokenizer classes.
+    """
+    num_views: int = 3
+    language_max_length: int = 50
+    # Hugging Face ProcessorMixin-required metadata
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = ("BartTokenizer", "BartTokenizerFast")
+    def __init__(self, image_processor=None, tokenizer=None):
+        """
+        Initialize XVLAProcessor.
+        Parameters
+        ----------
+        image_processor : PreTrainedImageProcessor, optional
+            The image processor used to normalize/resize images.
+        tokenizer : PreTrainedTokenizer, optional
+            The tokenizer used for text tokenization.
+        """
+        # ProcessorMixin automatically saves these under self.image_processor / self.tokenizer
+        super().__init__(image_processor, tokenizer)
+    # ================== LANGUAGE ENCODING ==================
+    def encode_language(self, language_instruction: Union[str, List[str]]) -> Dict[str, torch.Tensor]:
+        """
+        Tokenize one or more language instructions.
+        Every instruction is padded or truncated to `language_max_length` tokens.
+        Parameters
+        ----------
+        language_instruction : str or List[str]
+            A single instruction or a batch of instructions.
+        Returns
+        -------
+        Dict[str, torch.Tensor]
+            {
+              "input_ids": tensor of shape [B, L]
+            }
+        """
+        if isinstance(language_instruction, str):
+            language_instruction = [language_instruction]
+        inputs = self.tokenizer(
+            language_instruction,
+            return_tensors="pt",
+            padding="max_length",
+            max_length=self.language_max_length,
+            truncation=True,
+        )
+        # Only the token ids are returned; the tokenizer's other outputs are dropped.
+        return {"input_ids": inputs["input_ids"]}
+    # ================== IMAGE ENCODING ==================
+    def encode_image(
+        self,
+        images: Union[List, List[List]],
+        **kwargs
+    ) -> Dict[str, torch.Tensor]:
+        """
+        Preprocess one or more sets of multi-view images.
+        Parameters
+        ----------
+        images : List or List[List]
+            Single sample: [img1, img2, ...]
+            Batch: [[img1a, img1b], [img2a, img2b, img2c], ...]
+            Each image may be a PIL.Image, NumPy array, or torch.Tensor.
+        kwargs : dict
+            Extra arguments passed to the underlying image processor
+            (e.g., `do_resize=False`, `size=(224,224)`).
+        Returns
+        -------
+        Dict[str, torch.Tensor]
+            {
+              "image_input": tensor [B, num_views, C, H, W],
+              "image_mask": tensor [B, num_views]
+            }
+        Raises
+        ------
+        ValueError
+            If `images` is empty, or a sample holds more than `num_views` images
+            (the image tensor and the mask would otherwise disagree in shape).
+        """
+        if images is None or len(images) == 0:
+            raise ValueError("`images` must contain at least one image.")
+        # Normalize to batch form
+        if not isinstance(images[0], (list, tuple)):
+            images = [images]  # convert single sample to batch of size 1
+        batch_imgs, batch_masks = [], []
+        for sample_idx, sample_imgs in enumerate(images):
+            processed = self.image_processor(sample_imgs, return_tensors="pt", **kwargs)["pixel_values"]
+            V_exist = processed.size(0)
+            if V_exist > self.num_views:
+                raise ValueError(
+                    f"Sample {sample_idx} has {V_exist} image views, "
+                    f"but at most num_views={self.num_views} are supported."
+                )
+            # Pad to self.num_views
+            num_missing = self.num_views - V_exist
+            if num_missing > 0:
+                processed = torch.cat(
+                    [processed,
+                     processed.new_zeros(num_missing, *processed.shape[1:])],
+                    dim=0,
+                )
+            # Mask: True for valid slots, False for padding
+            image_mask = torch.zeros(self.num_views, dtype=torch.bool, device=processed.device)
+            image_mask[:V_exist] = True
+            batch_imgs.append(processed)
+            batch_masks.append(image_mask)
+        image_input = torch.stack(batch_imgs, dim=0)  # [B, num_views, C, H, W]
+        image_mask = torch.stack(batch_masks, dim=0)  # [B, num_views]
+        return {"image_input": image_input, "image_mask": image_mask}
+    # ================== COMBINED CALL ==================
+    def __call__(
+        self,
+        images: Optional[Union[List, List[List]]] = None,
+        language_instruction: Optional[Union[str, List[str]]] = None,
+        **kwargs
+    ) -> Dict[str, torch.Tensor]:
+        """
+        Combine image and text encoding into a unified multimodal input.
+        Parameters
+        ----------
+        images : List or List[List], optional
+            Single-sample or batched multi-view images.
+        language_instruction : str or List[str], optional
+            Corresponding text instructions.
+        kwargs : dict
+            Extra args passed to image processor.
+        Returns
+        -------
+        Dict[str, torch.Tensor]
+            {
+              "input_ids": [B, L], optional,
+              "image_input": [B, num_views, C, H, W], optional,
+              "image_mask": [B, num_views], optional
+            }
+        Raises
+        ------
+        ValueError
+            If both modalities are given and their batch sizes differ.
+        """
+        outputs: Dict[str, Any] = {}
+        # Encode language if provided
+        if language_instruction is not None:
+            outputs.update(self.encode_language(language_instruction))
+        # Encode image if provided
+        if images is not None:
+            outputs.update(self.encode_image(images, **kwargs))
+        # Batch alignment check. Raised explicitly (not `assert`) so it still runs under -O.
+        if "input_ids" in outputs and "image_input" in outputs:
+            text_batch = outputs["input_ids"].size(0)
+            image_batch = outputs["image_input"].size(0)
+            if text_batch != image_batch:
+                raise ValueError(
+                    f"Batch mismatch: text batch {text_batch} "
+                    f"!= image batch {image_batch}"
+                )
+        return outputs

server.py ADDED Viewed

	@@ -0,0 +1,95 @@

+from typing import Any, Dict
+import logging
+import traceback
+from fastapi import FastAPI, WebSocket, WebSocketDisconnect
+from fastapi.responses import JSONResponse
+import uvicorn
+import json_numpy
+import msgpack
+import msgpack_numpy as m
+from abc import ABC, abstractmethod
+m.patch()
+class ModelServer(ABC):
+    """
+    Abstract base class that exposes a model over HTTP and WebSocket.
+    Subclasses implement `inference_api`; `run` builds a FastAPI app that
+    serves it at `/act` (POST with JSON, WebSocket with msgpack) via uvicorn.
+    """
+    def __init__(self):
+        # Filled in lazily by `_build_app`.
+        self.app: FastAPI | None = None
+    @abstractmethod
+    def inference_api(self, payload: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+        """
+        Run inference on one request; must be provided by subclasses.
+        Parameters
+        ----------
+        payload : Dict[str, Any]
+            Decoded request content.
+        kwargs : dict
+            Keyword arguments given to `run` / `_build_app`.
+        Returns
+        -------
+        Dict[str, Any]
+            The inference result.
+            NOTE(review): the POST handler calls `.tolist()` on this value,
+            so it appears to expect an array-like result - confirm.
+        """
+    def _build_app(self, **infer_kwargs):
+        """
+        Create the FastAPI app once and store it on `self.app`.
+        `infer_kwargs` are forwarded to every `inference_api` call. When an
+        app already exists this is a no-op and `infer_kwargs` are ignored.
+        """
+        if self.app is not None:
+            return
+        api = FastAPI()
+        # OLD VERSION: HTTP endpoint that answers with a JSON response
+        @api.post("/act")
+        def act(payload: Dict[str, Any]):
+            try:
+                # Best-effort decoding of json_numpy-encoded str/bytes fields;
+                # a field that fails to decode is passed on as received.
+                for field, raw_value in payload.items():
+                    if not isinstance(raw_value, (str, bytes)):
+                        continue
+                    try:
+                        payload[field] = json_numpy.loads(raw_value)
+                    except Exception:
+                        pass
+                action = self.inference_api(payload, **infer_kwargs)
+                return JSONResponse({"action": action.tolist()})
+            except Exception:
+                logging.error(traceback.format_exc())
+                return JSONResponse({"error": "Request failed"}, status_code=400)
+        @api.websocket("/act")
+        async def websocket_endpoint(websocket: WebSocket):
+            await websocket.accept()
+            # Greet the client so it knows the channel is ready
+            await websocket.send_bytes(
+                msgpack.packb({"type": "welcome", "ok": True}, use_bin_type=True)
+            )
+            try:
+                while True:
+                    frame = await websocket.receive_bytes()
+                    payload = msgpack.unpackb(frame, raw=False)
+                    try:
+                        action_pred = self.inference_api(payload, **infer_kwargs)
+                    except Exception as e:
+                        # Report the failure and keep the connection open
+                        logging.error(traceback.format_exc())
+                        response = {"error": f"Inference failed: {e}"}
+                        await websocket.send_bytes(msgpack.packb(response, use_bin_type=True))
+                        continue
+                    # Pack and send the successful result
+                    response = {"action": action_pred}
+                    await websocket.send_bytes(msgpack.packb(response, use_bin_type=True))
+            except WebSocketDisconnect:
+                logging.info("WS disconnected")
+            except Exception:
+                # Anything else (e.g. an undecodable frame) ends the session
+                logging.error(traceback.format_exc())
+        self.app = api
+    def run(self, host: str = "0.0.0.0", port: int = 8000, **kwargs):
+        """
+        Build the app if necessary and serve it with uvicorn.
+        Parameters
+        ----------
+        host : str
+            Interface to bind.
+        port : int
+            TCP port to listen on.
+        kwargs : dict
+            Forwarded to `_build_app`, i.e. to every `inference_api` call.
+        """
+        logging.info(f"🚀 XVLAServer listening on http://{host}:{port}/act")
+        logging.info(f"🚀 XVLAServer listening on ws://{host}:{port}/act")
+        self._build_app(**kwargs)
+        assert self.app is not None
+        uvicorn.run(
+            self.app,
+            host=host,
+            port=port,
+            log_level="info",
+            ws_ping_interval=20,
+            ws_ping_timeout=20,
+        )

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "unk_token": "<unk>"
+}

state.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"global_step": 200000}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,59 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50264": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "mask_token": "<mask>",
+  "model_max_length": 1024,
+  "pad_token": "<pad>",
+  "processor_class": "XVLAProcessor",
+  "sep_token": "</s>",
+  "tokenizer_class": "BartTokenizer",
+  "trim_offsets": true,
+  "unk_token": "<unk>"
+}

transformer.py ADDED Viewed

	@@ -0,0 +1,403 @@

+# ------------------------------------------------------------------------------
+# Copyright 2025 2toINF (https://github.com/2toINF)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ------------------------------------------------------------------------------
+from __future__ import annotations
+import math
+from functools import partial
+from typing import Final, Iterable, Tuple
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# ------------------------------- Small utils ----------------------------------
+def _to_2tuple(x) -> Tuple:
+    """
+    Expand a value into a pair (stand-in for timm.layers.to_2tuple).
+    Strings, bytes and non-iterables are duplicated as ``(x, x)``. Any other
+    iterable yields its first two items, or its only item twice; an empty
+    iterable raises IndexError.
+    """
+    if isinstance(x, (str, bytes)) or not isinstance(x, Iterable):
+        return (x, x)
+    items = tuple(x)
+    first = items[0]
+    if len(items) >= 2:
+        return (first, items[1])
+    return (first, first)
+def _has_sdp_attention() -> bool:
+    """Return True when torch.nn.functional exposes scaled_dot_product_attention."""
+    missing = object()
+    return getattr(F, "scaled_dot_product_attention", missing) is not missing
+# ---------------------------------- MLP --------------------------------------
+class Mlp(nn.Module):
+    """
+    Two-layer feed-forward network for ViT-style blocks.
+    Pipeline: fc1 -> tanh-approximated GELU -> dropout -> optional norm ->
+    fc2 -> dropout. The two layers are `nn.Linear`, or 1x1 `nn.Conv2d` when
+    `use_conv` is set. `bias` and `drop` may each be a single value (used
+    for both layers) or a pair (first layer, second layer).
+    """
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: int | None = None,
+        out_features: int | None = None,
+        norm_layer: type[nn.Module] | None = None,
+        bias: bool | Tuple[bool, bool] = True,
+        drop: float | Tuple[float, float] = 0.0,
+        use_conv: bool = False,
+    ) -> None:
+        super().__init__()
+        # Unset (or zero) widths fall back to the input width
+        hidden_dim = hidden_features or in_features
+        out_dim = out_features or in_features
+        bias_fc1, bias_fc2 = _to_2tuple(bias)
+        p_drop1, p_drop2 = _to_2tuple(drop)
+        if use_conv:
+            make_layer = partial(nn.Conv2d, kernel_size=1)
+        else:
+            make_layer = nn.Linear
+        self.fc1 = make_layer(in_features, hidden_dim, bias=bias_fc1)
+        self.act = nn.GELU(approximate="tanh")
+        self.drop1 = nn.Dropout(p_drop1)
+        if norm_layer is None:
+            self.norm = nn.Identity()
+        else:
+            self.norm = norm_layer(hidden_dim)
+        self.fc2 = make_layer(hidden_dim, out_dim, bias=bias_fc2)
+        self.drop2 = nn.Dropout(p_drop2)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # The Linear variant expects [B, T, C]; shapes are the caller's job.
+        hidden = self.drop1(self.act(self.fc1(x)))
+        hidden = self.norm(hidden)
+        return self.drop2(self.fc2(hidden))
+# -------------------------------- Attention ----------------------------------
+class Attention(nn.Module):
+    """
+    Multi-head self-attention over a [B, T, C] sequence.
+    Uses `F.scaled_dot_product_attention` when this PyTorch build provides
+    it; otherwise computes softmax(q k^T * scale) v explicitly. Queries and
+    keys can optionally be normalised per head (`qk_norm`).
+    """
+    fused_attn: Final[bool]
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        qk_norm: bool = False,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+        norm_layer: type[nn.Module] = nn.LayerNorm,
+    ) -> None:
+        super().__init__()
+        assert dim % num_heads == 0, "dim should be divisible by num_heads"
+        head_dim = dim // num_heads
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        self.scale = head_dim ** -0.5
+        self.fused_attn = _has_sdp_attention()
+        # Single projection producing q, k and v at once
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        if qk_norm:
+            self.q_norm = norm_layer(head_dim)
+            self.k_norm = norm_layer(head_dim)
+        else:
+            self.q_norm = nn.Identity()
+            self.k_norm = nn.Identity()
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Apply self-attention followed by the output projection.
+        Parameters
+        ----------
+        x : Tensor, shape [B, T, C]
+            Input sequence.
+        Returns
+        -------
+        Tensor, shape [B, T, C]
+            Attended and projected sequence.
+        """
+        B, T, C = x.shape
+        packed = self.qkv(x).reshape(B, T, 3, self.num_heads, self.head_dim)
+        # -> [3, B, H, T, Dh], then split into three [B, H, T, Dh] tensors
+        q, k, v = packed.permute(2, 0, 3, 1, 4).unbind(0)
+        q = self.q_norm(q)
+        k = self.k_norm(k)
+        if self.fused_attn:
+            # Attention dropout is active only in training mode
+            p_drop = self.attn_drop.p if self.training else 0.0
+            ctx = F.scaled_dot_product_attention(q, k, v, dropout_p=p_drop)  # [B, H, T, Dh]
+        else:
+            scores = (q * self.scale) @ k.transpose(-2, -1)  # [B, H, T, T]
+            weights = self.attn_drop(scores.softmax(dim=-1))
+            ctx = weights @ v                                # [B, H, T, Dh]
+        merged = ctx.transpose(1, 2).reshape(B, T, C)        # [B, T, C]
+        return self.proj_drop(self.proj(merged))
+# ------------------------------- Utilities -----------------------------------
+def basic_init(module: nn.Module) -> None:
+    """
+    Initialise an `nn.Linear` in place; every other module type is ignored.
+    - Weight: Xavier uniform.
+    - Bias: zeros (when the layer has a bias).
+    Intended for use with `nn.Module.apply`.
+    """
+    if not isinstance(module, nn.Linear):
+        return
+    nn.init.xavier_uniform_(module.weight)
+    if module.bias is None:
+        return
+    nn.init.constant_(module.bias, 0.0)
+def timestep_embedding(t: torch.Tensor, dim: int, max_period: int = 100) -> torch.Tensor:
+    """
+    Create sinusoidal timestep embeddings.
+    The first ``dim // 2`` columns are cosines and the next ``dim // 2`` are
+    sines of ``t * freq``, with frequencies decaying geometrically from 1
+    towards ``1 / max_period``. An odd `dim` gets one trailing zero column.
+    Parameters
+    ----------
+    t : torch.Tensor
+        Shape [B]. Each element is a timestep index, may be fractional.
+    dim : int
+        Dimensionality of the output embedding.
+    max_period : int, default=100
+        Controls the minimum frequency of the sinusoids.
+    Returns
+    -------
+    torch.Tensor
+        Shape [B, dim]. Sinusoidal embeddings.
+    """
+    half = dim // 2
+    freqs = torch.exp(
+        -math.log(max_period)
+        * torch.arange(start=0, end=half, dtype=t.dtype, device=t.device)
+        / half
+    )
+    args = t[:, None] * freqs[None]
+    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+    if dim % 2 == 1:
+        # Build the pad column from the batch size rather than slicing
+        # `embedding`: for dim == 1 the embedding is [B, 0], so a slice would
+        # be empty and the result would come out as [B, 0] instead of [B, 1].
+        pad = embedding.new_zeros(embedding.shape[0], 1)
+        embedding = torch.cat([embedding, pad], dim=-1)
+    return embedding
+# ------------------------------- Core Layers ----------------------------------
+class DomainAwareLinear(nn.Module):
+    """
+    Affine layer whose weight and bias are selected per sample by domain id.
+    Each domain owns one row of `fc` (a flattened [input_size, output_size]
+    matrix) and one row of `bias` (an [output_size] vector).
+    """
+    def __init__(self, input_size: int, output_size: int, num_domains: int = 20) -> None:
+        super().__init__()
+        self.input_size = input_size
+        self.output_size = output_size
+        # Per-domain parameters are stored as embedding tables
+        self.fc = nn.Embedding(num_domains, output_size * input_size)
+        self.bias = nn.Embedding(num_domains, output_size)
+        nn.init.xavier_uniform_(self.fc.weight)
+        nn.init.zeros_(self.bias.weight)
+    def forward(self, x: torch.Tensor, domain_id: torch.LongTensor) -> torch.Tensor:
+        """
+        Apply each sample's domain-specific affine map.
+        Parameters
+        ----------
+        x : Tensor
+            [B, I] or [B, T, I]
+        domain_id : LongTensor
+            [B], domain indices.
+        Returns
+        -------
+        Tensor
+            [B, O] or [B, T, O], matching the rank of `x`.
+        """
+        batch = domain_id.shape[0]
+        # A 2-D input is treated as a length-1 sequence and restored on exit
+        was_2d = x.dim() == 2
+        if was_2d:
+            x = x.unsqueeze(1)
+        weight = self.fc(domain_id).view(batch, self.input_size, self.output_size)
+        offset = self.bias(domain_id).view(batch, 1, self.output_size)
+        out = x @ weight + offset
+        return out.squeeze(1) if was_2d else out
+class TransformerBlock(nn.Module):
+    """
+    Pre-LayerNorm Transformer block with two residual branches:
+    x + Attention(LN(x)), then x + Mlp(LN(x)).
+    Attention dropout and MLP dropout are both fixed at 0.1.
+    """
+    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float = 4.0) -> None:
+        super().__init__()
+        self.norm1 = nn.LayerNorm(hidden_size)
+        self.norm2 = nn.LayerNorm(hidden_size)
+        self.attn = Attention(hidden_size, num_heads=num_heads, qkv_bias=True, attn_drop=0.1)
+        mlp_hidden = int(hidden_size * mlp_ratio)
+        self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden, drop=0.1)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Run the attention and MLP sub-layers with residual connections.
+        Parameters
+        ----------
+        x : Tensor, [B, T, H]
+        Returns
+        -------
+        Tensor, [B, T, H]
+        """
+        attn_out = self.attn(self.norm1(x))
+        x = x + attn_out
+        mlp_out = self.mlp(self.norm2(x))
+        return x + mlp_out
+# --------------------------- Main Model ---------------------------------------
+class SoftPromptedTransformer(nn.Module):
+    """
+    Domain-conditioned Transformer that predicts actions from noisy action
+    tokens plus projected multi-modal features, with optional soft prompts.
+    Token layout fed to the backbone (along the sequence axis):
+        [action tokens | vlm tokens | aux visual tokens | soft prompts]
+    Only the leading action tokens are normalised and decoded.
+    Parameters
+    ----------
+    hidden_size : int
+        Width of the Transformer blocks.
+    multi_modal_input_size : int
+        Feature size of `vlm_features` and `aux_visual_inputs` before projection.
+    depth : int
+        Number of `TransformerBlock`s.
+    num_heads : int
+        Attention heads per block.
+    mlp_ratio : float
+        MLP hidden width as a multiple of `hidden_size`.
+    num_domains : int
+        Number of rows in every domain-indexed embedding table.
+    dim_action, dim_propio, dim_time : int
+        Sizes of the action vector, proprioception vector and timestep
+        embedding; their sum is the input width of `action_encoder`.
+    len_soft_prompts : int
+        Soft-prompt tokens appended per sample; 0 disables them.
+    max_len_seq : int
+        Length of the learned positional table (soft prompts are not counted).
+    use_hetero_proj : bool
+        Use per-domain `DomainAwareLinear` projections instead of shared
+        `nn.Linear` layers for the two feature streams.
+    """
+    def __init__(
+        self,
+        hidden_size: int = 768,
+        multi_modal_input_size: int = 768,
+        depth: int = 24,
+        num_heads: int = 16,
+        mlp_ratio: float = 4.0,
+        num_domains: int = 20,
+        dim_action: int = 20,
+        dim_propio: int = 20,
+        dim_time: int = 32,
+        len_soft_prompts: int = 32,
+        max_len_seq: int = 512,
+        use_hetero_proj: bool = False,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.dim_action = dim_action
+        self.dim_time = dim_time
+        self.len_soft_prompts = len_soft_prompts
+        self.use_hetero_proj = use_hetero_proj
+        self.blocks = nn.ModuleList(
+            [TransformerBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(depth)]
+        )
+        # Projections that bring both feature streams to the backbone width
+        if use_hetero_proj:
+            self.vlm_proj = DomainAwareLinear(multi_modal_input_size, hidden_size, num_domains=num_domains)
+            self.aux_visual_proj = DomainAwareLinear(multi_modal_input_size, hidden_size, num_domains=num_domains)
+        else:
+            self.vlm_proj = nn.Linear(multi_modal_input_size, hidden_size)
+            self.aux_visual_proj = nn.Linear(multi_modal_input_size, hidden_size)
+        # Learned positional table, initialised with N(0, 0.02^2)
+        self.pos_emb = nn.Parameter(torch.zeros(1, max_len_seq, hidden_size), requires_grad=True)
+        nn.init.normal_(self.pos_emb, std=0.02)
+        self.norm = nn.LayerNorm(hidden_size)
+        # Encoder input is the concatenation [action | proprio | time]
+        self.action_encoder = DomainAwareLinear(
+            dim_action + dim_time + dim_propio, hidden_size, num_domains=num_domains
+        )
+        self.action_decoder = DomainAwareLinear(hidden_size, dim_action, num_domains=num_domains)
+        if len_soft_prompts > 0:
+            # One flattened [len_soft_prompts, hidden_size] prompt per domain
+            self.soft_prompt_hub = nn.Embedding(num_domains, len_soft_prompts * hidden_size)
+            nn.init.normal_(self.soft_prompt_hub.weight, std=0.02)
+        # basic_init only acts on nn.Linear modules (Xavier weights, zero
+        # bias); the embedding tables and pos_emb keep the init given above.
+        self.apply(basic_init)
+    def forward(
+        self,
+        domain_id: torch.LongTensor,
+        vlm_features: torch.Tensor,
+        aux_visual_inputs: torch.Tensor,
+        action_with_noise: torch.Tensor,
+        proprio: torch.Tensor,
+        t: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Forward pass.
+        Inputs
+        ------
+        domain_id : [B]
+        vlm_features : [B, T_vlm, D]
+        aux_visual_inputs : [B, T_aux, D]
+        action_with_noise : [B, T_action, dim_action]
+        proprio : [B, dim_propio]
+        t : [B]
+        Returns
+        -------
+        Tensor
+            Predicted actions, [B, T_action, dim_action]
+        Raises
+        ------
+        ValueError
+            If T_action + T_vlm + T_aux exceeds `max_len_seq`.
+        """
+        B, num_actions = action_with_noise.shape[:2]
+        # Encode (action + proprio + time) → tokens
+        # proprio and the time embedding are repeated for every action step
+        time_emb = timestep_embedding(t, self.dim_time)                     # [B, dim_time]
+        time_tokens = time_emb.unsqueeze(1).expand(B, num_actions, self.dim_time)
+        proprio_tokens = proprio.unsqueeze(1).expand(B, num_actions, proprio.shape[-1])
+        action_tokens = torch.cat([action_with_noise, proprio_tokens, time_tokens], dim=-1)
+        x = self.action_encoder(action_tokens, domain_id)                   # [B, T_action, H]
+        # Project visual streams and concatenate
+        if self.use_hetero_proj:
+            x = torch.cat(
+                [x, self.vlm_proj(vlm_features, domain_id), self.aux_visual_proj(aux_visual_inputs, domain_id)],
+                dim=1,
+            )
+        else:
+            x = torch.cat([x, self.vlm_proj(vlm_features), self.aux_visual_proj(aux_visual_inputs)], dim=1)
+        # Add positional embeddings (over-long sequences are rejected, not truncated)
+        seq_len = x.shape[1]
+        if seq_len > self.pos_emb.shape[1]:
+            raise ValueError(
+                f"Sequence length {seq_len} exceeds max_len_seq={self.pos_emb.shape[1]}."
+            )
+        x = x + self.pos_emb[:, :seq_len, :]
+        # Append soft prompts (added after pos_emb, so they carry no positional term)
+        if self.len_soft_prompts > 0:
+            soft_prompts = self.soft_prompt_hub(domain_id).view(B, self.len_soft_prompts, self.hidden_size)
+            x = torch.cat([x, soft_prompts], dim=1)
+        # Transformer backbone
+        for block in self.blocks:
+            x = block(x)
+        # Decode only the action segment
+        return self.action_decoder(self.norm(x[:, :num_actions]), domain_id)

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff