Duplicate from AdrianLlopart/rskill-molmoact2-so101-nf4

Browse files

Files changed (16) hide show

.gitattributes +38 -0
README.md +192 -0
config.json +153 -0
configuration_molmoact2.py +543 -0
generation_config.json +6 -0
image_processing_molmoact2.py +546 -0
inference.py +768 -0
model.safetensors +3 -0
modeling_molmoact2.py +0 -0
norm_stats.json +202 -0
processing_molmoact2.py +418 -0
processor_config.json +85 -0
quantization_metadata.json +14 -0
tokenizer.json +3 -0
tokenizer_config.json +34 -0
video_processing_molmoact2.py +969 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,38 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/sample_realsense_top_rgb.png filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
+assets/sample_realsense_side_rgb.png filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,192 @@

+---
+library_name: transformers
+tags:
+  - molmoact2
+  - robotics
+  - image-text-to-text
+  - so100
+  - so101
+---
+<img src="assets/MolmoAct2.svg" alt="MolmoAct Logo" height="50">
+# **MolmoAct2-SO100_101**
+MolmoAct2 is an open vision-language-action model for robot control. It builds on Molmo2-ER and attaches a flow-matching continuous action expert that conditions on the VLM key-value cache through a per-layer connection.
+This checkpoint is fine-tuned on the SO-100/101 mixture with absolute joint-pose control and annotated language instructions. It is intended for both further fine-tuning and SO-100/101 policy inference.
+## Quick Links
+- 📂 Models: [Models](https://huggingface.co/collections/allenai/molmoact2-models), [Finetuned Models](https://huggingface.co/collections/allenai/molmoact2-finetuned-models)
+- 📂 Datasets: [MolmoAct2-BimanualYAM Dataset](https://huggingface.co/collections/allenai/molmoact2-datasets), [MolmoAct2 Datasets](https://huggingface.co/collections/allenai/molmoact2-datasets), [Molmo2-ER Datasets](https://huggingface.co/collections/allenai/molmo2-er-datasets)
+- 📄 Paper: [arXiv:2605.02881](https://arxiv.org/abs/2605.02881)
+- 💻 Code: [allenai/molmoact2](https://github.com/allenai/molmoact2)
+- 🎥 Blog Post: [MolmoAct2](https://allenai.org/blog/molmoact2)
+## Intended Use
+Use this checkpoint for SO-100/101 inference or for further fine-tuning. Dataset normalization metadata is stored in `norm_stats.json`; pass `norm_tag="so100_so101_molmoact2"` at inference time.
+Continuous action prediction is the intended and recommended inference mode. Discrete action prediction is exposed for parity and debugging, but we use continuous actions by default.
+## Install
+```bash
+pip install torch transformers pillow numpy huggingface_hub
+```
+## Sample Input
+This sample comes from `Beegbrain/pick_lemon_and_drop_in_bowl`, episode 0, frame 0. Camera order does not matter for this checkpoint; a random camera order is acceptable.
+| Realsense Top RGB | Realsense Side RGB |
+| --- | --- |
+| ![Sample realsense top RGB](assets/sample_realsense_top_rgb.png) | ![Sample realsense side RGB](assets/sample_realsense_side_rgb.png) |
+```python
+from huggingface_hub import hf_hub_download
+from PIL import Image
+import numpy as np
+repo_id = "allenai/MolmoAct2-SO100_101"
+top_rgb = Image.open(
+    hf_hub_download(repo_id, "assets/sample_realsense_top_rgb.png")
+).convert("RGB")
+side_rgb = Image.open(
+    hf_hub_download(repo_id, "assets/sample_realsense_side_rgb.png")
+).convert("RGB")
+task = "Move the arm towards the lemon, grasp it, lift it up, and drop it into the red bowl."
+robot_state = np.array(
+    [
+        -0.52734375,
+        189.140625,
+        181.40625,
+        60.64453125,
+        -3.603515625,
+        1.0971786975860596,
+    ],
+    dtype=np.float32,
+)
+```
+## Continuous Actions
+```python
+import numpy as np
+import torch
+from huggingface_hub import hf_hub_download
+from PIL import Image
+from transformers import AutoModelForImageTextToText, AutoProcessor
+repo_id = "allenai/MolmoAct2-SO100_101"
+top_rgb = Image.open(
+    hf_hub_download(repo_id, "assets/sample_realsense_top_rgb.png")
+).convert("RGB")
+side_rgb = Image.open(
+    hf_hub_download(repo_id, "assets/sample_realsense_side_rgb.png")
+).convert("RGB")
+task = "Move the arm towards the lemon, grasp it, lift it up, and drop it into the red bowl."
+robot_state = np.array(
+    [
+        -0.52734375,
+        189.140625,
+        181.40625,
+        60.64453125,
+        -3.603515625,
+        1.0971786975860596,
+    ],
+    dtype=np.float32,
+)
+processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
+model = AutoModelForImageTextToText.from_pretrained(
+    repo_id,
+    trust_remote_code=True,
+    dtype=torch.float32,
+).to("cuda").eval()
+out = model.predict_action(
+    processor=processor,
+    images=[top_rgb, side_rgb],
+    task=task,
+    state=robot_state,
+    norm_tag="so100_so101_molmoact2",
+    inference_action_mode="continuous",
+    enable_depth_reasoning=False,
+    num_steps=10,
+    normalize_language=True,
+    enable_cuda_graph=True,
+)
+actions = out.actions
+```
+MolmoAct2 was trained with mixed precision. For our reported experiments, we ran inference in `float32`. This path uses the most GPU memory: roughly 26GB with CUDA graph enabled, or around 24GB without CUDA graph.
+If you have a GPU with less memory, you can run inference with `bfloat16` instead:
+```python
+model = AutoModelForImageTextToText.from_pretrained(
+    repo_id,
+    trust_remote_code=True,
+    dtype=torch.bfloat16,
+).to("cuda").eval()
+with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+    out = model.predict_action(...)
+```
+Using `bfloat16` is much more memory efficient and can run under 16GB of GPU memory in our tests. It usually does not hurt performance much.
+Images may be PIL images or RGB arrays. Camera order does not need to be fixed for this checkpoint; a random camera order is acceptable. `state` is the raw robot state, and actions are returned in robot scale.
+`normalize_language=True` is the default. It lowercases the task string and removes trailing sentence punctuation to match training preprocessing. Set it to `False` if you need to preserve the task text exactly.
+`enable_cuda_graph=True` is the default. The first few calls can be slow because the model warms up and captures CUDA graphs, so run several random warm-up calls before measuring deployment latency. `num_steps` controls the continuous flow solver and defaults to the checkpoint config value, 10.
+Depth reasoning is disabled for this checkpoint. Passing `enable_depth_reasoning=True` will raise an error.
+## Discrete Actions
+Discrete action inference requires a caller-provided action tokenizer, which is not saved in this repository. Discrete mode decodes action tokens directly; the continuous action expert is not used.
+```python
+action_tokenizer = AutoProcessor.from_pretrained(
+    "allenai/MolmoAct2-FAST-Tokenizer",
+    trust_remote_code=True,
+)
+out = model.predict_action(
+    processor=processor,
+    images=[top_rgb, side_rgb],
+    task=task,
+    state=robot_state,
+    norm_tag="so100_so101_molmoact2",
+    inference_action_mode="discrete",
+    action_tokenizer=action_tokenizer,
+    enable_depth_reasoning=False,
+)
+```
+## Model and Hardware Safety
+MolmoAct2 models generate robot actions from visual observations and language instructions, but their behavior may vary across embodiments, environments, and hardware configurations. Users should carefully validate model outputs before deployment, especially when operating physical robots or other actuated systems. Where possible, actions should be monitored through interpretable intermediate outputs (adaptive depth map), simulation rollouts, action limits, or other safety checks before execution on hardware. The model’s action space should be bounded by the training data, robot controller limits, and task-specific safety constraints, including limits on speed, workspace, torque, and contact force. Users should follow the hardware manufacturer’s safety guidelines, use appropriate emergency-stop mechanisms, and operate the system only in a safely configured environment with human supervision.
+## Citation
+```bibtex
+@misc{fang2026molmoact2actionreasoningmodels,
+      title={MolmoAct2: Action Reasoning Models for Real-world Deployment},
+      author={Haoquan Fang and Jiafei Duan and Donovan Clay and Sam Wang and Shuo Liu and Weikai Huang and Xiang Fan and Wei-Chuan Tsai and Shirui Chen and Yi Ru Wang and Shanli Xing and Jaemin Cho and Jae Sung Park and Ainaz Eftekhar and Peter Sushko and Karen Farley and Angad Wadhwa and Cole Harrison and Winson Han and Ying-Chun Lee and Eli VanderBilt and Rose Hendrix and Suveen Ellawela and Lucas Ngoo and Joyce Chai and Zhongzheng Ren and Ali Farhadi and Dieter Fox and Ranjay Krishna},
+      year={2026},
+      eprint={2605.02881},
+      archivePrefix={arXiv},
+      primaryClass={cs.RO},
+      url={https://arxiv.org/abs/2605.02881},
+}
+```

config.json ADDED Viewed

	@@ -0,0 +1,153 @@

+{
+  "action_end_token_id": 151933,
+  "action_expert_config": {
+    "attn_dropout": 0.0,
+    "causal_attn": false,
+    "context_layer_norm": true,
+    "dropout": 0.0,
+    "ffn_multiple_of": 256,
+    "hidden_size": 768,
+    "mlp_ratio": 4.0,
+    "model_type": "molmoact2_action_expert",
+    "num_heads": 8,
+    "num_layers": 36,
+    "qk_norm": true,
+    "qk_norm_eps": 1e-06,
+    "rope": true,
+    "timestep_embed_dim": 256
+  },
+  "action_expert_depth_gate": false,
+  "action_expert_depth_gate_init_bias": -4.0,
+  "action_expert_depth_gate_per_layer": false,
+  "action_mode": "both",
+  "max_action_horizon": 30,
+  "action_output_token_id": 151931,
+  "action_start_token_id": 151932,
+  "action_token_start_id": 151934,
+  "adapter_config": {
+    "attention_dropout": 0.0,
+    "attn_implementation": "sdpa",
+    "float32_attention": true,
+    "head_dim": 72,
+    "hidden_act": "silu",
+    "hidden_size": 1152,
+    "image_feature_dropout": 0.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 9728,
+    "model_type": "molmoact2",
+    "num_attention_heads": 16,
+    "num_key_value_heads": 16,
+    "pooling_attention_mask": true,
+    "residual_dropout": 0.0,
+    "text_hidden_size": 2560,
+    "vit_layers": [
+      -3,
+      -9
+    ]
+  },
+  "add_action_expert": true,
+  "add_control_tokens": true,
+  "add_setup_tokens": true,
+  "architectures": [
+    "MolmoAct2ForConditionalGeneration"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_molmoact2.MolmoAct2Config",
+    "AutoModelForImageTextToText": "modeling_molmoact2.MolmoAct2ForConditionalGeneration"
+  },
+  "depth_end_token_id": null,
+  "depth_mode": 2,
+  "depth_output_token_id": null,
+  "depth_start_token_id": null,
+  "depth_token_start_id": null,
+  "dtype": "float32",
+  "enable_depth_reasoning": false,
+  "flow_matching_beta_alpha": 1.0,
+  "flow_matching_beta_beta": 1.5,
+  "flow_matching_cutoff": 1.0,
+  "flow_matching_num_steps": 10,
+  "flow_matching_time_offset": 0.001,
+  "flow_matching_time_scale": 0.999,
+  "frame_end_token_id": 154632,
+  "frame_start_token_id": 154631,
+  "image_col_id": 154627,
+  "image_end_token_id": 154625,
+  "image_high_res_id": 154626,
+  "image_low_res_id": 154630,
+  "image_patch_id": 154626,
+  "image_start_token_id": 154624,
+  "initializer_range": 0.02,
+  "low_res_image_start_token_id": 154628,
+  "mask_action_dim_padding": true,
+  "max_action_dim": 32,
+  "model_type": "molmoact2",
+  "n_obs_steps": 1,
+  "norm_stats_filename": "norm_stats.json",
+  "num_action_tokens": 2048,
+  "num_depth_codes": 100,
+  "num_depth_tokens": 0,
+  "num_state_tokens": 256,
+  "state_end_token_id": 151674,
+  "state_format": "discrete",
+  "state_start_token_id": 151673,
+  "state_token_start_id": 151675,
+  "text_config": {
+    "additional_vocab_size": 128,
+    "attention_dropout": 0.0,
+    "attn_implementation": "sdpa",
+    "embedding_dropout": 0.0,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 2560,
+    "initializer_range": 0.02,
+    "intermediate_size": 9728,
+    "layer_norm_eps": 1e-06,
+    "max_position_embeddings": 16384,
+    "model_type": "molmoact2_text",
+    "norm_after": false,
+    "num_attention_heads": 32,
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 8,
+    "qk_norm_type": "qwen3",
+    "qkv_bias": false,
+    "residual_dropout": 0.0,
+    "rope_parameters": {
+      "rope_theta": 5000000.0,
+      "rope_type": "default"
+    },
+    "rope_scaling_layers": null,
+    "rope_theta": 5000000.0,
+    "tie_word_embeddings": false,
+    "use_cache": true,
+    "use_qk_norm": true,
+    "vocab_size": 154624
+  },
+  "tie_word_embeddings": false,
+  "transformers_version": "5.3.0",
+  "use_frame_special_tokens": true,
+  "vit_config": {
+    "attention_dropout": 0.0,
+    "attn_implementation": "sdpa",
+    "float32_attention": true,
+    "head_dim": 72,
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "image_default_input_size": [
+      378,
+      378
+    ],
+    "image_num_pos": 729,
+    "image_patch_size": 14,
+    "initializer_range": 0.02,
+    "intermediate_size": 4304,
+    "layer_norm_eps": 1e-06,
+    "model_type": "molmoact2",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 27,
+    "num_key_value_heads": 16,
+    "residual_dropout": 0.0
+  },
+  "bos_token_id": 151645,
+  "eos_token_id": 151645,
+  "pad_token_id": 151643
+}

configuration_molmoact2.py ADDED Viewed

	@@ -0,0 +1,543 @@

+"""
+MolmoAct2 configuration
+"""
+from typing import Optional, Any
+from transformers import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class MolmoAct2VitConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MolmoAct2VisionTransformer`].
+    It is used to instantiate a `MolmoAct2VisionTransformer` according to the specified arguments,
+    defining the model architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Every named keyword argument of `__init__` is stored unchanged as an attribute of the same name.
+    `attn_implementation` is additionally forwarded to `PretrainedConfig.__init__` together with any
+    extra `**kwargs`.
+    Example:
+    ```python
+    >>> from transformers import MolmoAct2VitConfig, MolmoAct2VisionTransformer
+    >>> # Initializing a MolmoAct2VitConfig
+    >>> configuration = MolmoAct2VitConfig()
+    >>> # Initializing a MolmoAct2VisionTransformer (with random weights)
+    >>> model = MolmoAct2VisionTransformer(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "molmoact2"
+    # Key under which this sub-config appears in the parent config (see "vit_config" in config.json).
+    base_config_key = "vit_config"
+    def __init__(
+        self,
+        hidden_size: int = 1152,
+        intermediate_size: int = 4304,
+        num_hidden_layers: int = 27,
+        num_attention_heads: int = 16,
+        num_key_value_heads: int = 16,
+        head_dim: int = 72,
+        hidden_act: str = "gelu_pytorch_tanh",
+        layer_norm_eps: float = 1e-6,
+        image_default_input_size: tuple[int, int] = (378, 378),
+        image_patch_size: int = 14,
+        # NOTE(review): the default 577 differs from the 27 x 27 = 729 patch grid implied by the
+        # default input size (378) and patch size (14); the shipped config.json sets 729.
+        # Confirm whether this default is intentional.
+        image_num_pos: int = 577,
+        attention_dropout: float = 0.0,
+        residual_dropout: float = 0.0,
+        initializer_range: float = 0.02,
+        float32_attention: bool = True,
+        attn_implementation: str = "eager",
+        **kwargs,
+    ):
+        # Set on the instance before the base initializer runs, and also passed to it below.
+        # Keep this ordering: the base class may process `attn_implementation` itself.
+        self.attn_implementation = attn_implementation
+        super().__init__(
+            attn_implementation=attn_implementation,
+            **kwargs
+        )
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.layer_norm_eps = layer_norm_eps
+        self.image_default_input_size = image_default_input_size
+        self.image_patch_size = image_patch_size
+        self.image_num_pos = image_num_pos
+        self.attention_dropout = attention_dropout
+        self.residual_dropout = residual_dropout
+        self.initializer_range = initializer_range
+        self.float32_attention = float32_attention
+    @property
+    def image_num_patch(self):
+        """Return the patch-grid size as a 2-tuple.
+
+        Each component of `image_default_input_size` is floor-divided by `image_patch_size`,
+        so any remainder pixels are not counted as a patch.
+        """
+        h, w = self.image_default_input_size
+        return h // self.image_patch_size, w // self.image_patch_size
+class MolmoAct2AdapterConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of MolmoAct2Adapter. Together with
+    MolmoAct2VitConfig, it is used to instantiate a MolmoAct2VisionBackbone according to the specified arguments,
+    defining the model architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Every named keyword argument of `__init__` is stored unchanged as an attribute of the same name.
+    `attn_implementation` is additionally forwarded to `PretrainedConfig.__init__` together with any
+    extra `**kwargs`.
+    Example:
+    ```python
+    >>> from transformers import MolmoAct2VitConfig, MolmoAct2AdapterConfig, MolmoAct2VisionBackbone
+    >>> # Initializing a MolmoAct2VitConfig and a MolmoAct2AdapterConfig
+    >>> vit_config = MolmoAct2VitConfig()
+    >>> adapter_config = MolmoAct2AdapterConfig()
+    >>> # Initializing a MolmoAct2VisionBackbone (with random weights)
+    >>> model = MolmoAct2VisionBackbone(vit_config, adapter_config)
+    >>> # Accessing the model configuration
+    >>> vit_configuration = model.vit_config
+    >>> adapter_configuration = model.adapter_config
+    ```"""
+    model_type = "molmoact2"
+    # Key under which this sub-config appears in the parent config (see "adapter_config" in config.json).
+    base_config_key = "adapter_config"
+    def __init__(
+        self,
+        # Presumably indices of the ViT layers whose features are consumed by the adapter
+        # (negative values counting from the last layer) -- TODO confirm against the modeling code.
+        vit_layers: tuple = (-3, -9),
+        pooling_attention_mask: bool = False,
+        hidden_size: int = 1152,
+        num_attention_heads: int = 16,
+        num_key_value_heads: int = 16,
+        head_dim: int = 72,
+        float32_attention: bool = True,
+        attention_dropout: float = 0.0,
+        residual_dropout: float = 0.0,
+        hidden_act: str = "silu",
+        intermediate_size: int = 18944,
+        text_hidden_size: int = 3584,
+        image_feature_dropout: float = 0.0,
+        initializer_range: float = 0.02,
+        attn_implementation: str = "eager",
+        **kwargs,
+    ):
+        # Set on the instance before the base initializer runs, and also passed to it below.
+        # Keep this ordering: the base class may process `attn_implementation` itself.
+        self.attn_implementation = attn_implementation
+        super().__init__(
+            attn_implementation=attn_implementation,
+            **kwargs
+        )
+        self.vit_layers = vit_layers
+        self.pooling_attention_mask = pooling_attention_mask
+        self.hidden_size = hidden_size
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.float32_attention = float32_attention
+        self.attention_dropout = attention_dropout
+        self.residual_dropout = residual_dropout
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.text_hidden_size = text_hidden_size
+        self.image_feature_dropout = image_feature_dropout
+        self.initializer_range = initializer_range
+class MolmoAct2TextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MolmoAct2TextModel`]. It is used to instantiate a
+    `MolmoAct2TextModel` according to the specified arguments, defining the model architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Named keyword arguments of `__init__` are stored as attributes of the same name, with two exceptions:
+    `num_key_value_heads=None` is replaced by `num_attention_heads`, and `tie_word_embeddings` is only
+    forwarded to `PretrainedConfig.__init__`. After all attributes are set, the rotary-embedding
+    settings are checked with `rope_config_validation`.
+    Example:
+    ```python
+    >>> from transformers import MolmoAct2TextConfig, MolmoAct2TextModel
+    >>> # Initializing a MolmoAct2TextConfig
+    >>> configuration = MolmoAct2TextConfig()
+    >>> # Initializing a MolmoAct2TextModel (with random weights)
+    >>> model = MolmoAct2TextModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "molmoact2_text"
+    # Key under which this sub-config appears in the parent config (see "text_config" in config.json).
+    base_config_key = "text_config"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Module-name patterns mapped to "colwise"/"rowwise" partitioning styles
+    # (presumably consumed by the transformers tensor-parallel loader -- verify).
+    base_model_tp_plan = {
+        "blocks.*.self_attn.att_proj": "colwise",
+        "blocks.*.self_attn.attn_out": "rowwise",
+        "blocks.*.mlp.ff_proj": "colwise",
+        "blocks.*.mlp.ff_out": "rowwise",
+    }
+    # Per-module (input names, output names) pairs
+    # (presumably consumed by the transformers pipeline-parallel loader -- verify).
+    base_model_pp_plan = {
+        "wte": (["input_ids"], ["inputs_embeds"]),
+        "blocks": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "ln_f": (["hidden_states"], ["hidden_states"]),
+    }
+    def __init__(
+        self,
+        hidden_size: int = 3584,
+        num_attention_heads: int = 28,
+        num_key_value_heads: Optional[int] = 4,
+        head_dim: int = 128,
+        vocab_size: int = 152064,
+        additional_vocab_size: int = 128,
+        qkv_bias: bool = True,
+        num_hidden_layers: int = 48,
+        intermediate_size: int = 18944,
+        hidden_act: str = "silu",
+        embedding_dropout: float=0.0,
+        attention_dropout: float=0.0,
+        residual_dropout: float = 0.0,
+        max_position_embeddings: int = 4096,
+        rope_theta: float = 1000000.0,
+        # NOTE(review): annotated as a dict but defaults to None; effectively Optional[dict[str, Any]].
+        rope_scaling: dict[str, Any] = None,
+        rope_scaling_layers: Optional[list[int]] = None,
+        use_qk_norm: bool = False,
+        qk_norm_type: str = "olmo",
+        # NOTE(review): annotated as int, but the default 1e-6 is a float; the annotation should be float.
+        layer_norm_eps: int = 1e-6,
+        norm_after: bool = False,
+        initializer_range: float = 0.02,
+        use_cache=True,
+        tie_word_embeddings=False,
+        attn_implementation: str = "eager",
+        **kwargs,
+    ):
+        # Set on the instance before the base initializer runs, and also passed to it below.
+        # Keep this ordering: the base class may process `attn_implementation` itself.
+        self.attn_implementation = attn_implementation
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            attn_implementation=attn_implementation,
+            **kwargs
+        )
+        self.hidden_size = hidden_size
+        self.num_attention_heads = num_attention_heads
+        # When no key/value head count is given, use one key/value head per attention head.
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.vocab_size = vocab_size
+        self.additional_vocab_size = additional_vocab_size
+        self.qkv_bias = qkv_bias
+        self.num_hidden_layers = num_hidden_layers
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.embedding_dropout = embedding_dropout
+        self.attention_dropout = attention_dropout
+        self.residual_dropout = residual_dropout
+        self.max_position_embeddings = max_position_embeddings
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.rope_scaling_layers = rope_scaling_layers
+        self.use_qk_norm = use_qk_norm
+        self.qk_norm_type = qk_norm_type
+        self.layer_norm_eps = layer_norm_eps
+        self.norm_after = norm_after
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        # Validate the correctness of rotary position embeddings parameters.
+        # Must stay last: the validator receives the fully populated config object.
+        rope_config_validation(self)
+class MolmoAct2ActionExpertConfig(PretrainedConfig):
+    r"""Configuration for the MolmoAct2 modern action expert.
+
+    Every named keyword argument of `__init__` is stored unchanged as an attribute of the same
+    name; extra `**kwargs` are forwarded to `PretrainedConfig.__init__`.
+
+    `max_action_horizon` and `max_action_dim` are kept on the instance but are removed from the
+    dictionary returned by `to_dict()`, so they do not appear in the serialized nested config.
+    """
+    model_type = "molmoact2_action_expert"
+    # Key under which this sub-config appears in the parent config (see "action_expert_config" in config.json).
+    base_config_key = "action_expert_config"
+    def __init__(
+        self,
+        max_action_horizon: int = 32,
+        max_action_dim: int = 32,
+        hidden_size: int = 1024,
+        num_layers: int = 32,
+        num_heads: int = 16,
+        mlp_ratio: float = 8.0 / 3.0,
+        ffn_multiple_of: int = 256,
+        timestep_embed_dim: int = 256,
+        dropout: float = 0.0,
+        attn_dropout: float = 0.0,
+        context_layer_norm: bool = True,
+        qk_norm: bool = True,
+        qk_norm_eps: float = 1e-6,
+        rope: bool = True,
+        causal_attn: bool = False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.max_action_horizon = max_action_horizon
+        self.max_action_dim = max_action_dim
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.mlp_ratio = mlp_ratio
+        self.ffn_multiple_of = ffn_multiple_of
+        self.timestep_embed_dim = timestep_embed_dim
+        self.dropout = dropout
+        self.attn_dropout = attn_dropout
+        self.context_layer_norm = context_layer_norm
+        self.qk_norm = qk_norm
+        self.qk_norm_eps = qk_norm_eps
+        self.rope = rope
+        self.causal_attn = causal_attn
+    def to_dict(self):
+        """Return the base-class dictionary without `max_action_horizon` and `max_action_dim`.
+
+        Both keys are popped with a default, so a missing key does not raise.
+        """
+        output = super().to_dict()
+        # These are derived from the parent MolmoAct2Config for HF exports. Keeping
+        # them out of the public nested config avoids duplicated sources of truth.
+        output.pop("max_action_horizon", None)
+        output.pop("max_action_dim", None)
+        return output
+class MolmoAct2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MolmoAct2ForConditionalGeneration`].
+    It is used to instantiate an MolmoAct2 model according to the specified arguments, defining the model architecture.
+    Example:
+    ```python
+    >>> from transformers import MolmoAct2Config, MolmoAct2VitConfig, MolmoAct2AdapterConfig, MolmoAct2TextConfig
+    >>> # Initializing a MolmoAct2VitConfig
+    >>> vit_config = MolmoAct2VitConfig()
+    >>> # Initializing a MolmoAct2AdapterConfig
+    >>> adapter_config = MolmoAct2AdapterConfig()
+    >>> # Initializing a MolmoAct2TextConfig
+    >>> text_config = MolmoAct2TextConfig()
+    >>> # Initializing a MolmoAct2Config
+    >>> configuration = MolmoAct2Config(
+    >>>     vit_config=vit_config,
+    >>>     adapter_config=adapter_config,
+    >>>     text_config=text_config,
+    >>>     image_start_token_id=151936,
+    >>>     image_end_token_id=151937,
+    >>>     image_patch_id=151938,
+    >>>     image_col_id=151939,
+    >>>     low_res_image_start_token_id=151940,
+    >>>     image_low_res_id=151942,
+    >>>     frame_start_token_id=151943,
+    >>>     frame_end_token_id=151944,
+    >>> )
+    >>> # Initializing a model
+    >>> model = MolmoAct2ForConditionalGeneration(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "molmoact2"
+    sub_configs = {
+        "text_config": MolmoAct2TextConfig,
+        "vit_config": MolmoAct2VitConfig,
+        "adapter_config": MolmoAct2AdapterConfig,
+        "action_expert_config": MolmoAct2ActionExpertConfig,
+    }
+    def __init__(
+        self,
+        vit_config: MolmoAct2VitConfig = None,
+        adapter_config: MolmoAct2AdapterConfig = None,
+        text_config: MolmoAct2TextConfig = None,
+        action_expert_config: MolmoAct2ActionExpertConfig = None,
+        image_start_token_id: int = None,
+        low_res_image_start_token_id: int = None,
+        image_end_token_id: int = None,
+        image_low_res_id: int = None,
+        image_patch_id: int = None,
+        image_col_id: int = None,
+        frame_start_token_id: int = None,
+        frame_end_token_id: int = None,
+        use_frame_special_tokens: bool = True,
+        initializer_range: float = 0.02,
+        add_action_expert: bool = True,
+        max_action_dim: int = 32,
+        max_action_horizon: int = 30,
+        n_obs_steps: int = 30,
+        action_mode: str = "both",
+        state_format: str = "discrete",
+        flow_matching_num_steps: int = 10,
+        flow_matching_cutoff: float = 1.0,
+        flow_matching_time_offset: float = 0.001,
+        flow_matching_time_scale: float = 0.999,
+        flow_matching_beta_alpha: float = 1.0,
+        flow_matching_beta_beta: float = 1.5,
+        mask_action_dim_padding: bool = True,
+        enable_depth_reasoning: bool = False,
+        depth_mode: int = 2,
+        num_depth_codes: int = 100,
+        action_expert_depth_gate: bool = False,
+        action_expert_depth_gate_per_layer: bool = False,
+        action_expert_depth_gate_init_bias: float = -4.0,
+        action_output_token_id: int = None,
+        action_start_token_id: int = None,
+        action_end_token_id: int = None,
+        action_token_start_id: int = None,
+        num_action_tokens: int = 0,
+        depth_output_token_id: int = None,
+        depth_start_token_id: int = None,
+        depth_end_token_id: int = None,
+        depth_token_start_id: int = None,
+        num_depth_tokens: int = 0,
+        state_start_token_id: int = None,
+        state_end_token_id: int = None,
+        state_token_start_id: int = None,
+        num_state_tokens: int = 0,
+        add_setup_tokens: bool = True,
+        add_control_tokens: bool = True,
+        norm_stats_filename: str = "norm_stats.json",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if vit_config is None:
+            self.vit_config = MolmoAct2VitConfig()
+        elif isinstance(vit_config, dict):
+            self.vit_config = MolmoAct2VitConfig(**vit_config)
+        else:
+            self.vit_config = vit_config
+        if adapter_config is None:
+            self.adapter_config = MolmoAct2AdapterConfig()
+        elif isinstance(adapter_config, dict):
+            self.adapter_config = MolmoAct2AdapterConfig(**adapter_config)
+        else:
+            self.adapter_config = adapter_config
+        if text_config is None:
+            self.text_config = MolmoAct2TextConfig()
+        elif isinstance(text_config, dict):
+            self.text_config = MolmoAct2TextConfig(**text_config)
+        else:
+            self.text_config = text_config
+        self.add_action_expert = bool(add_action_expert)
+        if not self.add_action_expert:
+            self.action_expert_config = None
+        elif action_expert_config is None:
+            self.action_expert_config = MolmoAct2ActionExpertConfig(
+                max_action_horizon=max_action_horizon,
+                max_action_dim=max_action_dim,
+                num_layers=self.text_config.num_hidden_layers,
+            )
+        elif isinstance(action_expert_config, dict):
+            self.action_expert_config = MolmoAct2ActionExpertConfig(**action_expert_config)
+        else:
+            self.action_expert_config = action_expert_config
+        if self.add_action_expert:
+            self.action_expert_config.max_action_dim = int(max_action_dim)
+            self.action_expert_config.max_action_horizon = int(max_action_horizon)
+            self._validate_release_action_config(
+                state_format=state_format,
+            )
+        self.image_start_token_id = image_start_token_id
+        self.low_res_image_start_token_id = low_res_image_start_token_id
+        self.image_end_token_id = image_end_token_id
+        self.image_low_res_id = image_low_res_id
+        self.image_high_res_id = image_patch_id
+        self.image_patch_id = image_patch_id
+        self.image_col_id = image_col_id
+        self.frame_start_token_id = frame_start_token_id
+        self.frame_end_token_id = frame_end_token_id
+        self.use_frame_special_tokens = use_frame_special_tokens
+        self.initializer_range = initializer_range
+        self.max_action_dim = max_action_dim
+        self.max_action_horizon = max_action_horizon
+        self.n_obs_steps = n_obs_steps
+        self.action_mode = action_mode
+        self.state_format = state_format
+        self.flow_matching_num_steps = flow_matching_num_steps
+        self.flow_matching_cutoff = flow_matching_cutoff
+        self.flow_matching_time_offset = flow_matching_time_offset
+        self.flow_matching_time_scale = flow_matching_time_scale
+        self.flow_matching_beta_alpha = flow_matching_beta_alpha
+        self.flow_matching_beta_beta = flow_matching_beta_beta
+        self.mask_action_dim_padding = mask_action_dim_padding
+        self.enable_depth_reasoning = enable_depth_reasoning
+        self.depth_mode = depth_mode
+        self.num_depth_codes = num_depth_codes
+        self.action_expert_depth_gate = action_expert_depth_gate
+        self.action_expert_depth_gate_per_layer = action_expert_depth_gate_per_layer
+        self.action_expert_depth_gate_init_bias = action_expert_depth_gate_init_bias
+        self.action_output_token_id = action_output_token_id
+        self.action_start_token_id = action_start_token_id
+        self.action_end_token_id = action_end_token_id
+        self.action_token_start_id = action_token_start_id
+        self.num_action_tokens = num_action_tokens
+        self.depth_output_token_id = depth_output_token_id
+        self.depth_start_token_id = depth_start_token_id
+        self.depth_end_token_id = depth_end_token_id
+        self.depth_token_start_id = depth_token_start_id
+        self.num_depth_tokens = num_depth_tokens
+        self.state_start_token_id = state_start_token_id
+        self.state_end_token_id = state_end_token_id
+        self.state_token_start_id = state_token_start_id
+        self.num_state_tokens = num_state_tokens
+        self.add_setup_tokens = add_setup_tokens
+        self.add_control_tokens = add_control_tokens
+        self.norm_stats_filename = norm_stats_filename
+    @staticmethod
+    def _validate_release_action_config(
+        *,
+        state_format: str,
+    ) -> None:
+        if state_format != "discrete":
+            raise ValueError("MolmoAct2 HF export supports only state_format='discrete'.")
+    @property
+    def image_num_patch(self):
+        """Return ``image_num_patch`` as reported by ``self.vit_config``."""
+        # NOTE(review): this assert is stripped under `python -O`; a missing vit_config
+        # would then surface as an AttributeError on the next line instead.
+        assert self.vit_config is not None
+        return self.vit_config.image_num_patch
+    @property
+    def num_attention_heads(self):
+        """Return ``num_attention_heads`` from ``self.text_config``."""
+        return self.text_config.num_attention_heads
+    @property
+    def num_key_value_heads(self):
+        """Return ``num_key_value_heads`` from ``self.text_config``."""
+        return self.text_config.num_key_value_heads
+    @property
+    def head_dim(self):
+        """Return ``head_dim`` from ``self.text_config``."""
+        return self.text_config.head_dim
+    @property
+    def num_hidden_layers(self):
+        """Return ``num_hidden_layers`` from ``self.text_config``."""
+        return self.text_config.num_hidden_layers
+    @property
+    def hidden_size(self):
+        """Return ``hidden_size`` from ``self.text_config``."""
+        return self.text_config.hidden_size
+    @property
+    def vocab_size(self):
+        """Return ``vocab_size`` from ``self.text_config``."""
+        return self.text_config.vocab_size
+    @property
+    def max_position_embeddings(self):
+        """Return ``max_position_embeddings`` from ``self.text_config``."""
+        return self.text_config.max_position_embeddings
+MolmoAct2VitConfig.register_for_auto_class()
+MolmoAct2AdapterConfig.register_for_auto_class()
+MolmoAct2TextConfig.register_for_auto_class()
+MolmoAct2ActionExpertConfig.register_for_auto_class()
+MolmoAct2Config.register_for_auto_class()

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token_id": 151645,
+  "eos_token_id": 151645,
+  "pad_token_id": 151643,
+  "transformers_version": "5.3.0"
+}

image_processing_molmoact2.py ADDED Viewed

	@@ -0,0 +1,546 @@

+"""Image processor class for MolmoAct2"""
+from typing import Optional, Union
+import numpy as np
+import einops
+import torch
+import torchvision.transforms
+from transformers.image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ImageInput,
+    PILImageResampling,
+    make_flat_list_of_images,
+    valid_images,
+    to_numpy_array,
+)
+from transformers.image_transforms import convert_to_rgb
+from transformers.processing_utils import ImagesKwargs
+from transformers.image_processing_utils import BaseImageProcessor, get_size_dict
+from transformers.utils import logging
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.utils import TensorType, logging
+logger = logging.get_logger(__name__)
+def normalize_image(
+    image: np.ndarray,
+    image_mean: list[float],
+    image_std: list[float],
+) -> np.ndarray:
+    if np.allclose(image_mean, [0.5, 0.5, 0.5]) and np.allclose(image_std, [0.5, 0.5, 0.5]):
+        return image * np.asarray(2.0, dtype=np.float32) - np.asarray(1.0, dtype=np.float32)
+    image -= np.array(image_mean, dtype=np.float32)[None, None, :]
+    image /= np.array(image_std, dtype=np.float32)[None, None, :]
+    return image
+def resize_image(
+    image: np.ndarray,
+    desired_output_size: list[int],
+    resample: PILImageResampling,
+) -> np.ndarray:
+    image = torch.permute(torch.from_numpy(image), [2, 0, 1])
+    dtype = image.dtype
+    if torch.is_floating_point(image):
+        in_min = 0.0
+        in_max = 1.0
+        resized = torchvision.transforms.Resize(
+            desired_output_size,
+            resample,
+            antialias=False,
+        )(image)
+        resized = torch.clip(resized, 0.0, 1.0).to(dtype)
+    else:
+        assert image.dtype == torch.uint8, "SigLIP expects float images or uint8 images, but got {}".format(image.dtype)
+        in_min = 0.0
+        in_max = 255.0
+        resized = torchvision.transforms.Resize(
+            desired_output_size,
+            resample,
+            antialias=False,
+        )(image)
+        resized = torch.clip(resized, 0, 255).to(dtype)
+    resized = resized.to(torch.float32)
+    resized = (resized - in_min) / (in_max - in_min)
+    resized = torch.permute(resized, [1, 2, 0]).numpy()
+    return resized
+def select_tiling(h, w, patch_size, max_num_crops):
+    """Divide in image of size [w, h] in up to max_num_patches of size patch_size"""
+    original_size = np.stack([h, w])  # [1, 2]
+    original_res = h * w
+    tilings = []
+    for i in range(1, max_num_crops + 1):
+        for j in range(1, max_num_crops + 1):
+            if i*j <= max_num_crops:
+                tilings.append((i, j))
+    # sort so argmin and argmax favour smaller tilings in the event of a tie
+    tilings.sort(key=lambda x: (x[0]*x[1], x[0]))
+    candidate_tilings = np.array(tilings, dtype=np.int32)  # [n_resolutions, 2]
+    candidate_resolutions = candidate_tilings * patch_size  # [n_resolutions, 2]
+    # How much we would need to scale the image to fit exactly in each tiling
+    original_size = np.stack([h, w], dtype=np.float32)  # [1, 2]
+    # The original size can be zero in rare cases if the image is smaller than the margin
+    # In those cases letting the scale become infinite means the tiling is based on the
+    # other side, or falls back to the smallest tiling
+    with np.errstate(divide='ignore'):
+        required_scale_d = candidate_resolutions.astype(np.float32) / original_size,
+    required_scale = np.min(required_scale_d, axis=-1, keepdims=True)  # [n_resolutions, 1]
+    if np.all(required_scale < 1):
+        # We are forced to downscale, so try to minimize the amount of downscaling
+        ix = np.argmax(required_scale)
+    else:
+        # Pick the resolution that required the least upscaling so that it most closely fits the image
+        required_scale = np.where(required_scale < 1.0, 10e9, required_scale)
+        ix = np.argmin(required_scale)
+    return candidate_tilings[ix]
+def build_resized_image(
+    image: np.ndarray,
+    base_image_input_size: list[int],
+    resample: PILImageResampling,
+    image_mean: list[float],
+    image_std: list[float],
+    image_patch_size: int,
+) -> tuple[np.ndarray, np.ndarray]:
+    resized = resize_image(
+        image, base_image_input_size, resample,
+    )
+    resized = normalize_image(resized, image_mean, image_std)
+    if len(resized.shape) == 3:
+        resized = np.expand_dims(resized, 0)
+    crop_patch_w = base_image_input_size[1] // image_patch_size
+    crop_patch_h = base_image_input_size[0] // image_patch_size
+    resize_idx = np.arange(crop_patch_w*crop_patch_h).reshape([crop_patch_h, crop_patch_w])
+    return resized, resize_idx
+def build_overlapping_crops(
+    image: np.ndarray,
+    max_crops: int,
+    overlap_margins: list[int],
+    base_image_input_size: list[int],
+    resample: PILImageResampling,
+    image_mean: list[float],
+    image_std: list[float],
+    image_patch_size: int,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Decompose an image into a set of overlapping crops
+    :return crop_arr: [n_crops, h, w, 3] The crops
+    :return patch_idx: [overlap_patch_h, overlap_patch_w] For each patch in the resized image
+                        the crops were extracted from, what patch in `crop_arr` it corresponds to
+    """
+    original_image_h, original_image_w = image.shape[:2]
+    crop_size = base_image_input_size[0]
+    assert base_image_input_size[0] == base_image_input_size[1]
+    left_margin, right_margin = overlap_margins
+    total_margin_pixels = image_patch_size * (right_margin + left_margin)  # pixels removed per dim
+    crop_patches = base_image_input_size[0] // image_patch_size  # patches per crop dim
+    crop_window_patches = crop_patches - (right_margin + left_margin)  # usable patches
+    crop_window_size = crop_window_patches * image_patch_size
+    crop_patch_w = base_image_input_size[1] // image_patch_size
+    crop_patch_h = base_image_input_size[0] // image_patch_size
+    original_image_h, original_image_w = image.shape[:2]
+    crop_size = base_image_input_size[0]
+    # Decide how to tile the image, to account for the overlap margins we compute the tiling
+    # as if we had an image without the margins and were using a crop size without the margins
+    tiling = select_tiling(
+        original_image_h - total_margin_pixels,
+        original_image_w - total_margin_pixels,
+        crop_window_size,
+        max_crops,
+    )
+    src = resize_image(
+        image,
+        [tiling[0]*crop_window_size+total_margin_pixels, tiling[1]*crop_window_size+total_margin_pixels],
+        resample,
+    )
+    src = normalize_image(src, image_mean, image_std)
+    # Now we have to split the image into crops, and track what patches came from
+    # where in `patch_idx_arr`
+    n_crops = tiling[0] * tiling[1]
+    crop_arr = np.zeros([n_crops, crop_size, crop_size, 3], dtype=src.dtype)
+    patch_idx_arr = np.zeros([n_crops, crop_patch_h, crop_patch_w], dtype=np.int32)
+    on_crop = 0
+    for i in range(tiling[0]):
+        # Slide over `src` by `crop_window_size` steps, but extract crops of size `crops_size`
+        # which results in overlapping crop windows
+        y0 = i*crop_window_size
+        for j in range(tiling[1]):
+            x0 = j*crop_window_size
+            crop_arr[on_crop] = src[y0:y0+crop_size, x0:x0+crop_size]
+            patch_idx = np.arange(crop_patch_w*crop_patch_h).reshape(crop_patch_h, crop_patch_w)
+            patch_idx += on_crop * crop_patch_h * crop_patch_w
+            # Mask out idx that are in the overlap region
+            if i != 0:
+                patch_idx[:left_margin, :] = -1
+            if j != 0:
+                patch_idx[:, :left_margin] = -1
+            if i != tiling[0]-1:
+                patch_idx[-right_margin:, :] = -1
+            if j != tiling[1]-1:
+                patch_idx[:, -right_margin:] = -1
+            patch_idx_arr[on_crop] = patch_idx
+            on_crop += 1
+    # `patch_idx_arr` is ordered crop-by-crop, here we transpose `patch_idx_arr`
+    # so it is ordered left-to-right order
+    patch_idx_arr = np.reshape(
+        patch_idx_arr,
+        [tiling[0], tiling[1], crop_patch_h, crop_patch_w]
+    )
+    patch_idx_arr = np.transpose(patch_idx_arr, [0, 2, 1, 3])
+    patch_idx_arr = np.reshape(patch_idx_arr, [-1])
+    # Now get the parts not in the overlap region, so it should map each patch in `src`
+    # to the correct patch it should come from in `crop_arr`
+    patch_idx_arr = patch_idx_arr[patch_idx_arr >= 0].reshape(
+        src.shape[0]//image_patch_size,
+        src.shape[1]//image_patch_size,
+    )
+    return crop_arr, patch_idx_arr
+def batch_pixels_to_patches(array: np.ndarray, patch_size: int) -> np.ndarray:
+    """Reshape images of [n_images, h, w, 3] -> [n_images, n_patches, pixels_per_patch]"""
+    if len(array.shape) == 3:
+        n_crops, h, w = array.shape
+        h_patches = h//patch_size
+        w_patches = w//patch_size
+        array = np.reshape(array, [n_crops, h_patches, patch_size, w_patches, patch_size])
+        array = np.transpose(array, [0, 1, 3, 2, 4])
+        array = np.reshape(array, [n_crops, h_patches*w_patches, patch_size*patch_size])
+        return array
+    else:
+        n_crops, h, w, c = array.shape
+        h_patches = h//patch_size
+        w_patches = w//patch_size
+        array = np.reshape(array, [n_crops, h_patches, patch_size, w_patches, patch_size, c])
+        array = np.transpose(array, [0, 1, 3, 2, 4, 5])
+        array = np.reshape(array, [n_crops, h_patches*w_patches, patch_size*patch_size*c])
+        return array
+def arange_for_pooling(
+    idx_arr: np.ndarray,
+    pool_h: int,
+    pool_w: int,
+) -> np.ndarray:
+    h_pad = pool_h * ((idx_arr.shape[0] + pool_h - 1) // pool_h) - idx_arr.shape[0]
+    w_pad = pool_w * ((idx_arr.shape[1] + pool_w - 1) // pool_w) - idx_arr.shape[1]
+    idx_arr = np.pad(idx_arr, [[h_pad//2, (h_pad+1)//2], [w_pad//2, (w_pad+1)//2]],
+                     mode='constant',constant_values=-1)
+    return einops.rearrange(
+        idx_arr, "(h dh) (w dw) -> h w (dh dw)", dh=pool_h, dw=pool_w)
+def image_to_patches_and_grids(
+    image: np.ndarray,
+    max_crops: int,
+    overlap_margins: list[int],
+    base_image_input_size: list[int],
+    resample: PILImageResampling,
+    image_mean: list[float],
+    image_std: list[float],
+    image_patch_size: int,
+    image_pooling_w: int,
+    image_pooling_h: int,
+    crop_mode: str = "overlap-and-resize-c2",
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    :return image_grids, the shape of each (low-res, high-res) image after pooling
+    :return crops, the image crops to processes with the ViT
+    :return pooled_patch_idx, for each patch_id tokens in `image_tokens`, the indices of the
+                                patches in `crops` to pool for that token, masked with -1
+    """
+    if isinstance(base_image_input_size, int):
+        base_image_input_size = (base_image_input_size, base_image_input_size)
+    base_image_input_d = image_patch_size
+    pooling_w = image_pooling_w
+    pooling_h = image_pooling_h
+    crop_patch_w = base_image_input_size[1] // base_image_input_d
+    crop_patch_h = base_image_input_size[0] // base_image_input_d
+    if crop_mode == "resize":
+        resized, resize_idx = build_resized_image(
+            image,
+            base_image_input_size,
+            resample,
+            image_mean,
+            image_std,
+            image_patch_size,
+        )
+        resize_idx = arange_for_pooling(resize_idx, pooling_h, pooling_w)
+        resized_h, resized_w = resize_idx.shape[:2]
+        resize_idx = resize_idx.reshape([-1, pooling_h * pooling_w])
+        image_grid = [np.array([resized_h, resized_w, 0, 0])]
+        return (
+            np.stack(image_grid, 0),
+            batch_pixels_to_patches(resized, image_patch_size),
+            resize_idx,
+        )
+    if crop_mode not in {"overlap-and-resize-c2", "overlap-and-resize"}:
+        raise ValueError(f"Unsupported MolmoAct2 image crop_mode {crop_mode!r}.")
+    crop_arr, patch_idx_arr = build_overlapping_crops(
+        image,
+        max_crops,
+        overlap_margins,
+        base_image_input_size,
+        resample,
+        image_mean,
+        image_std,
+        image_patch_size,
+    )
+    pooling_idx = arange_for_pooling(patch_idx_arr, pooling_h, pooling_w)
+    h, w = pooling_idx.shape[:2]
+    pooling_idx = pooling_idx.reshape([-1, pooling_h*pooling_w])
+    # Finally do the same for the global image
+    resized, resize_idx = build_resized_image(
+        image,
+        base_image_input_size,
+        resample,
+        image_mean,
+        image_std,
+        image_patch_size,
+    )
+    crop_arr = np.concatenate([resized, crop_arr], 0)
+    resize_idx = arange_for_pooling(resize_idx, pooling_h, pooling_w)
+    resized_h, resized_w = resize_idx.shape[:2]
+    resize_idx = resize_idx.reshape([-1, pooling_h*pooling_w])
+    # Global image goes first, so the order of patches in previous crops gets increased
+    pooling_idx = np.where(
+        pooling_idx >= 0,
+        pooling_idx + crop_patch_h*crop_patch_w,
+        -1
+    )
+    pooling_idx = np.concatenate([resize_idx, pooling_idx])
+    image_grid = [np.array([resized_h, resized_w, h, w])]
+    return (
+        np.stack(image_grid, 0),
+        batch_pixels_to_patches(crop_arr, image_patch_size),
+        pooling_idx
+    )
+class MolmoAct2ImagesKwargs(ImagesKwargs, total=False):
+    """Optional MolmoAct2-specific image keyword arguments added on top of ``ImagesKwargs``."""
+    # Each field has the same name as a keyword parameter of
+    # `MolmoAct2ImageProcessor.preprocess`.
+    max_crops: Optional[int]
+    overlap_margins: Optional[list[int]]
+    crop_mode: Optional[str]
+    patch_size: Optional[int]
+    pooling_size: Optional[list[int]]
+class MolmoAct2ImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a MolmoAct2 image processor that preprocesses images for the model.
+    Args:
+        size (`dict[str, int]` *optional*, defaults to `{"height": 378, "width": 378}`):
+            Size of the image after resizing.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+            Resampling filter to use when resizing the image.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
+            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
+        image_std (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+        max_crops (`int`, *optional*, defaults to `8`):
+            Maximum number of crops to use per image.
+        overlap_margins (`list[int]`, *optional*, defaults to `[4, 4]`):
+            Overlap margins to use.
+        patch_size (`int`, *optional*, defaults to 14):
+            The spatial patch size of the vision encoder.
+        pooling_size (`list[int]`, *optional*, defaults to `[2, 2]`):
+            The pooling size of the vision adapter.
+    """
+    model_input_names = ["pixel_values", "image_token_pooling", "image_grids", "image_num_crops"]
+    def __init__(
+        self,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_rgb: bool = True,
+        max_crops: int = 8,
+        overlap_margins: list[int] = [4, 4],
+        crop_mode: str = "overlap-and-resize-c2",
+        patch_size: int = 14,
+        pooling_size: list[int] = [2, 2],
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 378, "width": 378}
+        size = get_size_dict(size, default_to_square=True)
+        self.size = size
+        self.resample = resample
+        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
+        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
+        self.do_convert_rgb = do_convert_rgb
+        self.max_crops = max_crops
+        self.overlap_margins = overlap_margins
+        self.crop_mode = crop_mode
+        self.patch_size = patch_size
+        self.pooling_size = pooling_size
+    def preprocess(
+        self,
+        images: ImageInput,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_rgb: Optional[bool] = None,
+        max_crops: Optional[int] = None,
+        overlap_margins: Optional[list[int]] = None,
+        crop_mode: Optional[str] = None,
+        patch_size: Optional[int] = None,
+        pooling_size: Optional[list[int]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Args:
+            images (`ImageInput`):
+                Image to preprocess.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after resizing.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use when resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            max_crops (`int`, *optional*, defaults to `self.max_crops`):
+                Maximum number of crops to use per image.
+            overlap_margins (`list[int]`, *optional*, defaults to `self.overlap_margins`):
+                Overlap margins to use.
+            patch_size (`int`, *optional*, defaults to `self.patch_size`):
+                The spatial patch size of the vision encoder.
+            pooling_size (`list[int]`, *optional*, defaults to `self.pooling_size`):
+                The pooling size of the vision adapter.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+        Returns:
+            A `BatchFeature` containing the following keys:
+                - `pixel_values`: The preprocessed images.
+                - `image_token_pooling`: The indices of the patches in `crops` to pool for each token in `image_tokens`.
+                - `image_grids`: The image grids.
+                - `image_num_crops`: The number of crops for each image.
+        """
+        if size is not None:
+            if "height" not in size or "width" not in size:
+                raise ValueError("size must contain 'height' and 'width' keys.")
+        else:
+            size = {**self.size}
+        base_image_input_size = [size["height"], size["width"]]
+        resample = resample or self.resample
+        image_mean = image_mean or self.image_mean
+        image_std = image_std or self.image_std
+        do_convert_rgb = do_convert_rgb or self.do_convert_rgb
+        max_crops = max_crops or self.max_crops
+        overlap_margins = overlap_margins or self.overlap_margins
+        crop_mode = crop_mode or self.crop_mode
+        patch_size = patch_size or self.patch_size
+        pooling_size = pooling_size or self.pooling_size
+        image_pooling_h, image_pooling_w = pooling_size
+        if images is not None:
+            images = self.fetch_images(images)
+            images = make_flat_list_of_images(images)
+        if images is not None and not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+        data = {}
+        if images is not None:
+            batch_grids = []
+            batch_crops = []
+            batch_pooled_patches_idx = []
+            batch_num_crops = []
+            for image in images:
+                image_grid, crops, pooled_idx = image_to_patches_and_grids(
+                    image,
+                    max_crops,
+                    overlap_margins,
+                    base_image_input_size,
+                    resample,
+                    image_mean,
+                    image_std,
+                    patch_size,
+                    image_pooling_w,
+                    image_pooling_h,
+                    crop_mode,
+                )
+                batch_grids.append(image_grid)
+                batch_crops.append(crops)
+                batch_pooled_patches_idx.append(pooled_idx)
+                batch_num_crops.append(crops.shape[0])
+            pixel_values = np.concatenate(batch_crops, 0)
+            image_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)
+            image_grids = np.concatenate(batch_grids, 0)
+            image_num_crops = np.array(batch_num_crops)
+            data.update(
+                pixel_values=pixel_values,
+                image_token_pooling=image_token_pooling,
+                image_grids=image_grids,
+                image_num_crops=image_num_crops,
+            )
+        return BatchFeature(data, tensor_type=return_tensors)
+MolmoAct2ImageProcessor.register_for_auto_class()

inference.py ADDED Viewed

	@@ -0,0 +1,768 @@

+"""Inference utilities for MolmoAct2"""
+from dataclasses import dataclass
+from typing import Any, Iterable, Optional, Sequence, Tuple
+import torch
+from torch.nn import functional as F
+from transformers.cache_utils import Cache
+from transformers.configuration_utils import PretrainedConfig
+@dataclass
+class _ActionFlowInputs:
+    """Inputs passed to the action-flow loop and checked by ``ActionCudaGraphManager``."""
+    # Tensor whose device decides whether the CUDA-graph path can be used at all.
+    trajectory: torch.Tensor
+    # Object read for `kv_contexts`, `cross_mask`, `self_mask`, `valid_action` and
+    # `rope_cache` in `ActionCudaGraphManager.can_use_action_flow`.
+    context: Any
+    # Per-step objects read for `conditioning`, `block_modulations` and `final_modulation`.
+    modulations: Sequence[Any]
+    # Optional tensor; presumably flags padded action dimensions — TODO confirm.
+    action_dim_is_pad: Optional[torch.Tensor]
+@dataclass
+class _ActionFlowCudaGraph:
+    """Cached CUDA graph for the action-flow loop, together with its buffers."""
+    # Compared against `_cuda_graph_key(inputs, steps)` to decide whether the cached
+    # graph can be reused.
+    key: Tuple[Any, ...]
+    graph: torch.cuda.CUDAGraph
+    # Presumably the cloned inputs the graph was captured with — TODO confirm.
+    static_inputs: _ActionFlowInputs
+    # Presumably the tensor the captured graph writes its result into — TODO confirm.
+    output: torch.Tensor
+@dataclass
+class _DepthDecodeCudaGraphLayerStage:
+    """Four tensors grouped per layer for the depth-decode CUDA graph."""
+    # NOTE(review): the code using these fields is outside this view; the names suggest
+    # a residual stream plus attention query/key/value buffers — confirm at the use site.
+    residual: torch.Tensor
+    query: torch.Tensor
+    key: torch.Tensor
+    value: torch.Tensor
+@dataclass
+class _DepthDecodeCudaGraphPostStage:
+    """A CUDA graph paired with an ``attn_context`` tensor."""
+    # NOTE(review): presumably the graph replayed after attention, with `attn_context`
+    # as its input buffer — confirm at the use site.
+    graph: torch.cuda.CUDAGraph
+    attn_context: torch.Tensor
+@dataclass
+class _DepthDecodeCudaGraph:
+    """Captured depth-decode CUDA graphs together with their tensors and stages."""
+    # NOTE(review): field roles below are inferred from names only; the code that
+    # builds and replays this object is outside this view.
+    cache_key: Tuple[Any, ...]
+    pre_graph: torch.cuda.CUDAGraph
+    token_ids: torch.Tensor
+    cos: torch.Tensor
+    sin: torch.Tensor
+    positions: torch.Tensor
+    # One `_DepthDecodeCudaGraphLayerStage` / `_DepthDecodeCudaGraphPostStage` per
+    # entry; presumably one entry per decoder layer — TODO confirm.
+    stages: Sequence[_DepthDecodeCudaGraphLayerStage]
+    post_graphs: Sequence[_DepthDecodeCudaGraphPostStage]
+    output: torch.Tensor
+@dataclass
+class _DepthDecodeCudaGraphSpec:
+    """Eligibility flag plus the model dimensions used for depth-decode CUDA graphs."""
+    # NOTE(review): presumably `eligible` gates graph capture and `cache_key_prefix`
+    # seeds `_DepthDecodeCudaGraph.cache_key` — confirm at the use site.
+    eligible: bool
+    cache_key_prefix: Tuple[Any, ...]
+    num_hidden_layers: int
+    head_dim: int
+    num_attention_heads: int
+def _cache_seq_len_int(past_key_values: Optional[Cache]) -> int:
+    if past_key_values is None:
+        return 0
+    seq_len = past_key_values.get_seq_length()
+    if torch.is_tensor(seq_len):
+        return int(seq_len.item())
+    return int(seq_len)
+def _cache_max_len_int(past_key_values: Optional[Cache]) -> int:
+    if past_key_values is None:
+        return -1
+    max_len = past_key_values.get_max_cache_shape()
+    if torch.is_tensor(max_len):
+        return int(max_len.item())
+    return int(max_len)
+def _iter_cache_key_values(
+    past_key_values: Cache,
+) -> Iterable[Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]]:
+    layers = getattr(past_key_values, "layers", None)
+    if layers is not None:
+        for layer in layers:
+            yield getattr(layer, "keys", None), getattr(layer, "values", None)
+        return
+    for layer in past_key_values:
+        yield layer[0], layer[1]
+class _DepthDecodeStaticLayerCache:
+    is_compileable = False
+    is_sliding = False
+    def __init__(self, max_cache_len: int) -> None:
+        self.max_cache_len = int(max_cache_len)
+        self.cumulative_length = 0
+        self.keys: Optional[torch.Tensor] = None
+        self.values: Optional[torch.Tensor] = None
+    def _allocate(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None:
+        bsz, n_heads = key_states.shape[:2]
+        self.keys = torch.empty(
+            (bsz, n_heads, self.max_cache_len, key_states.shape[-1]),
+            dtype=key_states.dtype,
+            device=key_states.device,
+        )
+        self.values = torch.empty(
+            (bsz, n_heads, self.max_cache_len, value_states.shape[-1]),
+            dtype=value_states.dtype,
+            device=value_states.device,
+        )
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        *args,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if self.keys is None:
+            self._allocate(key_states, value_states)
+        start = self.cumulative_length
+        end = start + key_states.shape[-2]
+        if end > self.max_cache_len:
+            raise RuntimeError(
+                f"KV cache length {end} exceeds max_cache_len={self.max_cache_len}."
+            )
+        self.keys[:, :, start:end, :].copy_(key_states)
+        self.values[:, :, start:end, :].copy_(value_states)
+        self.cumulative_length = end
+        return self.keys[:, :, :end, :], self.values[:, :, :end, :]
+    def get_seq_length(self) -> int:
+        return self.cumulative_length
+    def get_max_cache_shape(self) -> int:
+        return -1
+    def reset(self) -> None:
+        self.cumulative_length = 0
+class _DepthDecodeStaticCache(Cache):
+    def __init__(self, config: PretrainedConfig, max_cache_len: int) -> None:
+        text_config = config.get_text_config(decoder=True)
+        super().__init__(
+            layers=[
+                _DepthDecodeStaticLayerCache(max_cache_len=max_cache_len)
+                for _ in range(text_config.num_hidden_layers)
+            ]
+        )
+    def get_seq_length(self, layer_idx: int = 0) -> int:
+        return self.layers[layer_idx].get_seq_length()
+    def get_max_cache_shape(self, layer_idx: int = 0) -> int:
+        return self.layers[layer_idx].get_max_cache_shape()
+    def reset(self) -> None:
+        for layer in self.layers:
+            layer.reset()
+class ActionCudaGraphManager:
+    def __init__(self, model: Any) -> None:
+        self.model = model
+        self.enabled = True
+        self.action_flow_graph: Optional[_ActionFlowCudaGraph] = None
+    def set_enabled(self, enabled: bool) -> None:
+        self.enabled = bool(enabled)
+    def can_use_action_flow(self, inputs: _ActionFlowInputs) -> bool:
+        action_model = self.model
+        if not self.enabled:
+            return False
+        if action_model.training or action_model._require_action_expert().training:
+            return False
+        if inputs.trajectory.device.type != "cuda":
+            return False
+        def all_on_cuda():
+            yield inputs.trajectory
+            for k, v in inputs.context.kv_contexts:
+                yield k
+                yield v
+            for t in (
+                inputs.context.cross_mask,
+                inputs.context.self_mask,
+                inputs.context.valid_action,
+                inputs.action_dim_is_pad,
+            ):
+                if t is not None:
+                    yield t
+            if inputs.context.rope_cache is not None:
+                yield from inputs.context.rope_cache
+            for step in inputs.modulations:
+                yield step.conditioning
+                for block_modulation in step.block_modulations:
+                    yield from block_modulation
+                yield from step.final_modulation
+        return all(t.device.type == "cuda" for t in all_on_cuda())
+    def run_action_flow(
+        self,
+        inputs: _ActionFlowInputs,
+        steps: int,
+        run_loop,
+    ) -> torch.Tensor:
+        key = _cuda_graph_key(inputs, steps)
+        cache = self.action_flow_graph
+        if cache is None or cache.key != key:
+            static_inputs = _clone_static_inputs(inputs)
+            graph, output = _capture_cuda_graph(
+                lambda: run_loop(static_inputs, steps),
+                inputs.trajectory.device,
+                after_warmup=lambda: static_inputs.trajectory.copy_(inputs.trajectory),
+            )
+            cache = _ActionFlowCudaGraph(
+                key=key,
+                graph=graph,
+                static_inputs=static_inputs,
+                output=output,
+            )
+            self.action_flow_graph = cache
+        else:
+            _copy_inputs_(cache.static_inputs, inputs)
+        cache.graph.replay()
+        return cache.output.clone()
+class DepthDecodeCudaGraphManager:
+    def __init__(self, model: Any) -> None:
+        self.model = model
+        self.backbone = model.model
+        self.enabled = True
+        self.graph: Optional[_DepthDecodeCudaGraph] = None
+        self.graph_spec: Optional[_DepthDecodeCudaGraphSpec] = None
+    def set_enabled(self, enabled: bool) -> None:
+        self.enabled = bool(enabled)
+    def make_static_cache(self, max_cache_len: int) -> _DepthDecodeStaticCache:
+        return _DepthDecodeStaticCache(
+            config=self.model.config.text_config,
+            max_cache_len=max_cache_len,
+        )
+    def _depth_decode_spec(self) -> _DepthDecodeCudaGraphSpec:
+        static = self.graph_spec
+        if static is None:
+            cfg = self.backbone.transformer.config
+            rotary_emb = getattr(self.backbone.transformer, "rotary_emb", None)
+            static = _DepthDecodeCudaGraphSpec(
+                eligible=(
+                    not cfg.norm_after
+                    and cfg.rope_scaling_layers is None
+                    and getattr(rotary_emb, "rope_type", None) == "default"
+                    and cfg._attn_implementation == "sdpa"
+                ),
+                cache_key_prefix=(
+                    cfg.hidden_size,
+                    cfg.num_attention_heads,
+                    cfg.num_key_value_heads,
+                    cfg.head_dim,
+                    cfg.num_hidden_layers,
+                    cfg.use_qk_norm,
+                    cfg.qk_norm_type,
+                    cfg._attn_implementation,
+                ),
+                num_hidden_layers=cfg.num_hidden_layers,
+                head_dim=cfg.head_dim,
+                num_attention_heads=cfg.num_attention_heads,
+            )
+            self.graph_spec = static
+        return static
+    def can_use(
+        self,
+        next_input_ids: torch.Tensor,
+        *,
+        past_key_values: Cache,
+        attention_bias: torch.Tensor,
+    ) -> bool:
+        if (
+            not self.enabled
+            or self.model.training
+            or self.backbone.transformer.training
+        ):
+            return False
+        if next_input_ids.device.type != "cuda":
+            return False
+        if (
+            next_input_ids.ndim != 2
+            or next_input_ids.shape[0] != 1
+            or next_input_ids.shape[1] != 1
+        ):
+            return False
+        if not isinstance(past_key_values, _DepthDecodeStaticCache):
+            return False
+        if (
+            not torch.is_tensor(attention_bias)
+            or attention_bias.device != next_input_ids.device
+        ):
+            return False
+        return self._depth_decode_spec().eligible
+    def _depth_decode_key(
+        self,
+        next_input_ids: torch.Tensor,
+        attention_bias: torch.Tensor,
+    ) -> Tuple[Any, ...]:
+        device = next_input_ids.device
+        return (
+            self._depth_decode_spec().cache_key_prefix,
+            device.type,
+            device.index,
+            self.model.lm_head.weight.dtype,
+            attention_bias.shape[-1],
+        )
+    def _select_depth_decode_rope(
+        self, cos: torch.Tensor, sin: torch.Tensor, *, past_length: int
+    ) -> None:
+        emb = self.backbone.transformer.rotary_emb
+        cos.copy_(emb._pos_cos_cache[0, :, past_length : past_length + 1, :])
+        sin.copy_(emb._pos_sin_cache[0, :, past_length : past_length + 1, :])
+    def _depth_decode_pre_layer(
+        self,
+        layer_idx: int,
+        hidden_states: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        block = self.backbone.transformer.blocks[layer_idx]
+        attention = block.self_attn
+        residual = hidden_states
+        hidden_states = block.attn_norm(hidden_states)
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, attention.head_dim)
+        qkv = attention.att_proj(hidden_states)
+        query_states, key_states, value_states = qkv.split(attention.fused_dims, dim=-1)
+        value_states = value_states.view(hidden_shape)
+        apply_qk_norm = attention.q_norm is not None and attention.k_norm is not None
+        norm_after_view = apply_qk_norm and attention.qk_norm_type == "qwen3"
+        if apply_qk_norm and not norm_after_view:
+            query_states = attention.q_norm(query_states)
+            key_states = attention.k_norm(key_states)
+        query_states = query_states.view(hidden_shape)
+        key_states = key_states.view(hidden_shape)
+        if norm_after_view:
+            query_states = attention.q_norm(query_states)
+            key_states = attention.k_norm(key_states)
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+        query_states, key_states = _apply_rotary_pos_emb(
+            query_states, key_states, cos, sin
+        )
+        return residual, query_states, key_states, value_states
+    def _depth_decode_pre0(
+        self,
+        token_ids: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        inputs_embeds = self.model._embed_base_tokens(token_ids)
+        return self._depth_decode_pre_layer(0, inputs_embeds, cos, sin)
+    def _depth_decode_post_layer(
+        self,
+        layer_idx: int,
+        residual: torch.Tensor,
+        attn_context: torch.Tensor,
+    ) -> torch.Tensor:
+        block = self.backbone.transformer.blocks[layer_idx]
+        attention = block.self_attn
+        input_shape = residual.shape[:-1]
+        attn_output = attn_context.reshape(*input_shape, -1).contiguous()
+        attn_output = attention.attn_out(attn_output)
+        hidden_states = residual + block.dropout(attn_output)
+        residual = hidden_states
+        hidden_states = block.ff_norm(hidden_states)
+        hidden_states = block.mlp(hidden_states)
+        hidden_states = residual + block.dropout(hidden_states)
+        return hidden_states
+    def _depth_decode_post_and_pre_next(
+        self,
+        layer_idx: int,
+        residual: torch.Tensor,
+        attn_context: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        hidden_states = self._depth_decode_post_layer(layer_idx, residual, attn_context)
+        return self._depth_decode_pre_layer(layer_idx + 1, hidden_states, cos, sin)
+    def _depth_decode_last_post(
+        self,
+        layer_idx: int,
+        residual: torch.Tensor,
+        attn_context: torch.Tensor,
+    ) -> torch.Tensor:
+        hidden_states = self._depth_decode_post_layer(layer_idx, residual, attn_context)
+        return self.backbone.transformer.ln_f(hidden_states)
+    def _build_depth_decode_graph(
+        self,
+        next_input_ids: torch.Tensor,
+        *,
+        past_length: int,
+        attention_bias: torch.Tensor,
+    ) -> _DepthDecodeCudaGraph:
+        text_config = self.backbone.transformer.config
+        device = next_input_ids.device
+        dtype = self.model.lm_head.weight.dtype
+        static = self._depth_decode_spec()
+        num_layers = static.num_hidden_layers
+        head_dim = static.head_dim
+        max_cache_len = int(attention_bias.shape[-1])
+        max_rope_len = max(int(text_config.max_position_embeddings or 0), max_cache_len)
+        self.backbone.transformer.prepare_rope_cache(
+            device=device, max_seq_len=max_rope_len
+        )
+        token_ids = torch.empty((1, 1), device=device, dtype=torch.long)
+        cos = torch.empty((1, 1, head_dim), device=device, dtype=dtype)
+        sin = torch.empty_like(cos)
+        positions = torch.arange(max_cache_len, device=device, dtype=torch.long)
+        context_shape = (1, 1, static.num_attention_heads, head_dim)
+        token_ids.copy_(next_input_ids)
+        self._select_depth_decode_rope(cos, sin, past_length=past_length)
+        pre_graph, pre_output = _capture_cuda_graph(
+            lambda: self._depth_decode_pre0(token_ids, cos, sin),
+            device,
+        )
+        stages = [_DepthDecodeCudaGraphLayerStage(*pre_output)]
+        post_graphs = []
+        for layer_idx in range(num_layers - 1):
+            stage = stages[-1]
+            attn_context = torch.empty(context_shape, device=device, dtype=dtype)
+            graph, output = _capture_cuda_graph(
+                lambda layer_idx=layer_idx, stage=stage, attn_context=attn_context: (
+                    self._depth_decode_post_and_pre_next(
+                        layer_idx,
+                        stage.residual,
+                        attn_context,
+                        cos,
+                        sin,
+                    )
+                ),
+                device,
+            )
+            post_graphs.append(
+                _DepthDecodeCudaGraphPostStage(graph=graph, attn_context=attn_context)
+            )
+            stages.append(_DepthDecodeCudaGraphLayerStage(*output))
+        last_stage = stages[-1]
+        last_attn_context = torch.empty(context_shape, device=device, dtype=dtype)
+        last_graph, last_output = _capture_cuda_graph(
+            lambda: self._depth_decode_last_post(
+                num_layers - 1,
+                last_stage.residual,
+                last_attn_context,
+            ),
+            device,
+        )
+        post_graphs.append(
+            _DepthDecodeCudaGraphPostStage(
+                graph=last_graph, attn_context=last_attn_context
+            )
+        )
+        return _DepthDecodeCudaGraph(
+            cache_key=self._depth_decode_key(next_input_ids, attention_bias),
+            pre_graph=pre_graph,
+            token_ids=token_ids,
+            cos=cos,
+            sin=sin,
+            positions=positions,
+            stages=tuple(stages),
+            post_graphs=tuple(post_graphs),
+            output=last_output,
+        )
+    def _get_depth_decode_graph(
+        self,
+        next_input_ids: torch.Tensor,
+        *,
+        past_length: int,
+        attention_bias: torch.Tensor,
+    ) -> _DepthDecodeCudaGraph:
+        key = self._depth_decode_key(next_input_ids, attention_bias)
+        decode_graph = self.graph
+        if decode_graph is None or decode_graph.cache_key != key:
+            decode_graph = self._build_depth_decode_graph(
+                next_input_ids,
+                past_length=past_length,
+                attention_bias=attention_bias,
+            )
+            self.graph = decode_graph
+        else:
+            decode_graph.token_ids.copy_(next_input_ids)
+            self._select_depth_decode_rope(
+                decode_graph.cos, decode_graph.sin, past_length=past_length
+            )
+        return decode_graph
+    def _run_depth_decode_attention_core(
+        self,
+        layer_idx: int,
+        stage: _DepthDecodeCudaGraphLayerStage,
+        *,
+        past_key_values: Cache,
+        attention_bias: torch.Tensor,
+        cache_position: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ) -> torch.Tensor:
+        attention = self.backbone.transformer.blocks[layer_idx].self_attn
+        cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+        key_states, value_states = past_key_values.update(
+            stage.key,
+            stage.value,
+            layer_idx,
+            cache_kwargs,
+        )
+        key_states = _repeat_kv(key_states, attention.num_key_value_groups)
+        value_states = _repeat_kv(value_states, attention.num_key_value_groups)
+        attn_output = F.scaled_dot_product_attention(
+            stage.query,
+            key_states,
+            value_states,
+            attn_mask=attention_bias,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        return attn_output.transpose(1, 2)
+    def run(
+        self,
+        next_input_ids: torch.Tensor,
+        *,
+        past_key_values: Cache,
+        attention_bias: torch.Tensor,
+        past_length: int,
+    ) -> Tuple[torch.Tensor, Cache]:
+        end = past_length + 1
+        decode_graph = self._get_depth_decode_graph(
+            next_input_ids,
+            past_length=past_length,
+            attention_bias=attention_bias,
+        )
+        cache_position = decode_graph.positions[past_length:end]
+        attention_bias_q = attention_bias[:, :, past_length:end, :end]
+        decode_graph.pre_graph.replay()
+        for layer_idx, post_graph in enumerate(decode_graph.post_graphs):
+            attn_context = self._run_depth_decode_attention_core(
+                layer_idx,
+                decode_graph.stages[layer_idx],
+                past_key_values=past_key_values,
+                attention_bias=attention_bias_q,
+                cache_position=cache_position,
+                cos=decode_graph.cos,
+                sin=decode_graph.sin,
+            )
+            post_graph.attn_context.copy_(attn_context)
+            post_graph.graph.replay()
+        return decode_graph.output, past_key_values
+def _cuda_graph_tensor_signature(
+    tensor: Optional[torch.Tensor],
+) -> Optional[Tuple[Any, ...]]:
+    if tensor is None:
+        return None
+    return (
+        tuple(tensor.shape),
+        tuple(tensor.stride()),
+        str(tensor.dtype),
+        str(tensor.device),
+    )
+def _cuda_graph_context_signature(context: Any) -> Tuple[Any, ...]:
+    sig = _cuda_graph_tensor_signature
+    return (
+        tuple((sig(k), sig(v)) for k, v in context.kv_contexts),
+        sig(context.cross_mask),
+        sig(context.self_mask),
+        sig(context.valid_action),
+        None
+        if context.rope_cache is None
+        else tuple(sig(t) for t in context.rope_cache),
+    )
+def _cuda_graph_modulation_signature(modulations: Sequence[Any]) -> Tuple[Any, ...]:
+    sig = _cuda_graph_tensor_signature
+    return tuple(
+        (
+            sig(step.conditioning),
+            tuple(
+                tuple(sig(t) for t in block_modulation)
+                for block_modulation in step.block_modulations
+            ),
+            tuple(sig(t) for t in step.final_modulation),
+        )
+        for step in modulations
+    )
+def _cuda_graph_key(inputs: _ActionFlowInputs, steps: int) -> Tuple[Any, ...]:
+    sig = _cuda_graph_tensor_signature
+    return (
+        sig(inputs.trajectory),
+        _cuda_graph_context_signature(inputs.context),
+        _cuda_graph_modulation_signature(inputs.modulations),
+        sig(inputs.action_dim_is_pad),
+        int(steps),
+    )
+def _clone_static_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+    if tensor is None:
+        return None
+    static = torch.empty_strided(
+        tuple(tensor.shape),
+        tuple(tensor.stride()),
+        device=tensor.device,
+        dtype=tensor.dtype,
+    )
+    static.copy_(tensor)
+    return static
+def _clone_static_context(context: Any) -> Any:
+    rope_cache = None
+    if context.rope_cache is not None:
+        rope_cache = tuple(_clone_static_tensor(t) for t in context.rope_cache)
+    return context.__class__(
+        kv_contexts=tuple(
+            (_clone_static_tensor(k), _clone_static_tensor(v))
+            for k, v in context.kv_contexts
+        ),
+        cross_mask=_clone_static_tensor(context.cross_mask),
+        self_mask=_clone_static_tensor(context.self_mask),
+        valid_action=_clone_static_tensor(context.valid_action),
+        rope_cache=rope_cache,
+    )
+def _clone_static_modulations(modulations: Sequence[Any]) -> Sequence[Any]:
+    return tuple(
+        step.__class__(
+            conditioning=_clone_static_tensor(step.conditioning),
+            block_modulations=tuple(
+                tuple(_clone_static_tensor(t) for t in block_modulation)
+                for block_modulation in step.block_modulations
+            ),
+            final_modulation=tuple(
+                _clone_static_tensor(t) for t in step.final_modulation
+            ),
+        )
+        for step in modulations
+    )
+def _clone_static_inputs(inputs: _ActionFlowInputs) -> _ActionFlowInputs:
+    return _ActionFlowInputs(
+        trajectory=_clone_static_tensor(inputs.trajectory),
+        context=_clone_static_context(inputs.context),
+        modulations=_clone_static_modulations(inputs.modulations),
+        action_dim_is_pad=_clone_static_tensor(inputs.action_dim_is_pad),
+    )
+def _copy_context_(dst: Any, src: Any) -> None:
+    for (dst_k, dst_v), (src_k, src_v) in zip(dst.kv_contexts, src.kv_contexts):
+        dst_k.copy_(src_k)
+        dst_v.copy_(src_v)
+    if src.cross_mask is not None:
+        dst.cross_mask.copy_(src.cross_mask)
+    if src.self_mask is not None:
+        dst.self_mask.copy_(src.self_mask)
+    if src.valid_action is not None:
+        dst.valid_action.copy_(src.valid_action)
+    if src.rope_cache is not None:
+        for dst_tensor, src_tensor in zip(dst.rope_cache, src.rope_cache):
+            dst_tensor.copy_(src_tensor)
+def _copy_inputs_(dst: _ActionFlowInputs, src: _ActionFlowInputs) -> None:
+    dst.trajectory.copy_(src.trajectory)
+    _copy_context_(dst.context, src.context)
+    if src.action_dim_is_pad is not None:
+        dst.action_dim_is_pad.copy_(src.action_dim_is_pad)
+def _rotate_half(x: torch.Tensor) -> torch.Tensor:
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+def _apply_rotary_pos_emb(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (_rotate_half(q) * sin)
+    k_embed = (k * cos) + (_rotate_half(k) * sin)
+    return q_embed, k_embed
+def _repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(
+        batch, num_key_value_heads, n_rep, slen, head_dim
+    )
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+def _capture_cuda_graph(
+    fn,
+    device: torch.device,
+    *,
+    after_warmup=None,
+) -> Tuple[torch.cuda.CUDAGraph, Any]:
+    """Call `fn` once as a warm-up on a side stream, then capture a second call in a CUDA graph.
+    Args:
+        fn: zero-argument callable to capture; it is invoked twice (warm-up, then capture).
+        device: CUDA device whose current stream is synchronised with the warm-up stream.
+        after_warmup: optional zero-argument callable run between warm-up and capture.
+    Returns:
+        The captured graph and the value `fn` returned during capture.
+    """
+    # Warm up on a separate stream that first waits for work queued on the current stream.
+    warmup_stream = torch.cuda.Stream(device=device)
+    warmup_stream.wait_stream(torch.cuda.current_stream(device))
+    with torch.cuda.stream(warmup_stream):
+        fn()
+    # Make the current stream wait for the warm-up before anything else is queued.
+    torch.cuda.current_stream(device).wait_stream(warmup_stream)
+    if after_warmup is not None:
+        after_warmup()
+    graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(graph):
+        # The output of this call is what later `graph.replay()` calls refresh.
+        output = fn()
+    return graph, output

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a62d97555bec82618aa1e7c3f143a41067ba7e8bed074d825b9b1e88391d516a
+size 4183452369

modeling_molmoact2.py ADDED Viewed

The diff for this file is too large to render. See raw diff

norm_stats.json ADDED Viewed

	@@ -0,0 +1,202 @@

+{
+  "format": "molmoact2_norm_stats.v1",
+  "norm_mode": "q01_q99",
+  "metadata_by_tag": {
+    "so100_so101_molmoact2": {
+      "action_key": "action",
+      "state_key": "observation.state",
+      "camera_keys": [],
+      "normalize_gripper": true,
+      "action_horizon": 30,
+      "n_action_steps": 30,
+      "setup_type": "single so100/so101 robotic arm in molmoact2",
+      "control_mode": "absolute joint pose",
+      "action_stats": {
+        "min": [
+          -122.607421875,
+          -270.0,
+          -269.208984375,
+          -125.771484375,
+          -269.912109375,
+          -31.57327651977539
+        ],
+        "max": [
+          179.208984375,
+          219.638671875,
+          195.380859375,
+          178.9453125,
+          269.82421875,
+          119.40789031982422
+        ],
+        "mean": [
+          3.343996486826433,
+          125.7905980370996,
+          120.20220128113388,
+          55.88144220174933,
+          -11.543010633027725,
+          11.25886240824774
+        ],
+        "std": [
+          28.909870406169997,
+          52.25069634659296,
+          47.94432906599221,
+          36.01019142727721,
+          69.35504013212369,
+          17.116239869449775
+        ],
+        "count": [
+          19619650.0
+        ],
+        "q01": [
+          -42.1300246338976,
+          45.18258358164995,
+          35.40059182962813,
+          4.929781836327758,
+          -65.57568617645342,
+          -0.3016556932619033
+        ],
+        "q10": [
+          -25.040070398997557,
+          68.27827215165794,
+          65.76540485606242,
+          26.58811186925123,
+          -39.81868441470048,
+          0.26123181871944706
+        ],
+        "q50": [
+          3.0828094324713105,
+          124.5495736487354,
+          122.75175717637279,
+          57.77960070056314,
+          -11.094802886190045,
+          4.866634607477139
+        ],
+        "q90": [
+          31.591544866079253,
+          181.76986724267596,
+          168.5741215400282,
+          82.4353358815596,
+          16.05609349144359,
+          32.12324970648343
+        ],
+        "q99": [
+          48.55349563198916,
+          186.10646680077767,
+          173.6076722013997,
+          93.41056417929472,
+          43.53107398260694,
+          44.74649336930881
+        ],
+        "names": [
+          "shoulder_pan",
+          "shoulder_lift",
+          "elbow_flex",
+          "wrist_flex",
+          "wrist_roll",
+          "gripper"
+        ],
+        "mask": [
+          true,
+          true,
+          true,
+          true,
+          true,
+          true
+        ]
+      },
+      "state_stats": {
+        "min": [
+          -115.048828125,
+          -270.0,
+          -235.8984375,
+          -113.818359375,
+          -268.9453125,
+          -8.521058082580566
+        ],
+        "max": [
+          178.505859375,
+          218.49609375,
+          192.041015625,
+          207.861328125,
+          250.048828125,
+          118.2519302368164
+        ],
+        "mean": [
+          3.3225097946752244,
+          124.40594064960378,
+          121.59550610749059,
+          55.903039878016074,
+          -11.41740021122887,
+          13.358497334686597
+        ],
+        "std": [
+          28.79265204113751,
+          52.702867303079756,
+          47.00596021941705,
+          35.53803566355756,
+          69.12836626047817,
+          16.333280282904557
+        ],
+        "count": [
+          19619650.0
+        ],
+        "q01": [
+          -41.90962240941357,
+          43.66791235922949,
+          38.38770483255723,
+          5.711740446834044,
+          -63.44539045209019,
+          0.9435577790191543
+        ],
+        "q10": [
+          -24.949315993050774,
+          66.30007546431412,
+          68.16816985859437,
+          27.120731646136054,
+          -39.50255020332888,
+          1.6190225837869365
+        ],
+        "q50": [
+          3.066375725640164,
+          123.16482094240277,
+          124.39930058290133,
+          57.88605464633133,
+          -11.037436711677765,
+          9.241478261568748
+        ],
+        "q90": [
+          31.472920732960127,
+          180.87158401301218,
+          168.5699720215359,
+          81.64709150074712,
+          15.887605114617852,
+          31.887861734718296
+        ],
+        "q99": [
+          48.29435703371732,
+          185.2611055842669,
+          173.13578487933165,
+          91.78122415137209,
+          42.94491979114059,
+          44.13755601580974
+        ],
+        "names": [
+          "shoulder_pan",
+          "shoulder_lift",
+          "elbow_flex",
+          "wrist_flex",
+          "wrist_roll",
+          "gripper"
+        ],
+        "mask": [
+          true,
+          true,
+          true,
+          true,
+          true,
+          true
+        ]
+      }
+    }
+  }
+}

processing_molmoact2.py ADDED Viewed

	@@ -0,0 +1,418 @@

+"""
+Processor class for MolmoAct2.
+"""
+from typing import Optional, Union
+import dataclasses
+import numpy as np
+from transformers.image_utils import ImageInput
+from transformers.video_utils import VideoInput
+from transformers.processing_utils import (
+    Unpack,
+    ProcessingKwargs,
+    ProcessorMixin,
+)
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.tokenization_utils_base import TextInput, PreTokenizedInput
+from transformers.utils import logging
+from transformers import AutoTokenizer
+from .image_processing_molmoact2 import MolmoAct2ImagesKwargs, MolmoAct2ImageProcessor
+from .video_processing_molmoact2 import MolmoAct2VideoProcessorKwargs, MolmoAct2VideoProcessor
+logger = logging.get_logger(__name__)
+# Special tokens, these should be present in any tokenizer we use since the preprocessor uses them
+IMAGE_PATCH_TOKEN = f"<im_patch>"  # Where to insert high-res tokens
+IMAGE_LOW_RES_TOKEN = f"<im_low>"  # Where to insert low-res tokens
+IM_START_TOKEN = f"<im_start>"
+LOW_RES_IMAGE_START_TOKEN = f"<low_res_im_start>"
+FRAME_START_TOKEN = f"<frame_start>"
+IM_END_TOKEN = f"<im_end>"
+FRAME_END_TOKEN= f"<frame_end>"
+IM_COL_TOKEN = f"<im_col>"
+IMAGE_PROMPT = "<|image|>"
+VIDEO_PROMPT = "<|video|>"
+IMAGE_TOKENS = [
+    IMAGE_PATCH_TOKEN,
+    IM_COL_TOKEN,
+    IM_START_TOKEN,
+    LOW_RES_IMAGE_START_TOKEN,
+    FRAME_START_TOKEN,
+    IM_END_TOKEN,
+    FRAME_END_TOKEN,
+    IMAGE_LOW_RES_TOKEN,
+]
+class MolmoAct2ProcessorKwargs(ProcessingKwargs, total=False):
+    """Keyword arguments accepted by the MolmoAct2 processor, with per-modality defaults."""
+    images_kwargs: MolmoAct2ImagesKwargs
+    videos_kwargs: MolmoAct2VideoProcessorKwargs
+    # Defaults: text is not padded and multimodal token-type ids are requested;
+    # the video processor is asked to return metadata.
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+            "return_mm_token_type_ids": True,
+        },
+        "videos_kwargs": {"return_metadata": True},
+    }
+class MolmoAct2Processor(ProcessorMixin):
+    attributes = ["image_processor", "video_processor", "tokenizer"]
+    optional_attributes = [
+        "chat_template",
+        "time_mode",
+        "image_use_col_tokens",
+        "use_single_crop_col_tokens",
+        "use_single_crop_start_token",
+        "video_use_col_tokens",
+        "use_frame_special_tokens",
+    ]
+    image_processor_class = "AutoImageProcessor"
+    video_processor_class = "AutoVideoProcessor"
+    tokenizer_class = "AutoTokenizer"
+    def __init__(
+        self,
+        image_processor: MolmoAct2ImageProcessor = None,
+        video_processor: MolmoAct2VideoProcessor = None,
+        tokenizer: AutoTokenizer = None,
+        chat_template: Optional[str] = None,
+        image_use_col_tokens: Optional[bool] = True,
+        use_single_crop_col_tokens: Optional[bool] = None,
+        use_single_crop_start_token: Optional[bool] = True,
+        video_use_col_tokens: Optional[bool] = False,
+        use_frame_special_tokens: Optional[bool] = True,
+        **kwargs
+    ) -> None:
+        super().__init__(
+            image_processor,
+            video_processor,
+            tokenizer,
+            chat_template=chat_template,
+        )
+        self.image_use_col_tokens = image_use_col_tokens
+        self.use_single_crop_col_tokens = use_single_crop_col_tokens
+        self.use_single_crop_start_token = use_single_crop_start_token
+        self.video_use_col_tokens = video_use_col_tokens
+        self.use_frame_special_tokens = use_frame_special_tokens
+        self.image_placeholder_token = IMAGE_PROMPT
+        self.video_placeholder_token = VIDEO_PROMPT
+        self.image_token_ids = [
+            tokenizer.convert_tokens_to_ids(token)
+            for token in IMAGE_TOKENS
+        ]
+    def get_image_tokens(self, image_grid: np.ndarray):
+        resized_h, resized_w, height, width = image_grid
+        if int(height) == 0 or int(width) == 0:
+            per_row = np.full(resized_w, IMAGE_PATCH_TOKEN)
+            use_single_crop_col_tokens = (
+                self.image_use_col_tokens
+                if self.use_single_crop_col_tokens is None
+                else self.use_single_crop_col_tokens
+            )
+            if use_single_crop_col_tokens:
+                per_row = np.concatenate([per_row, [IM_COL_TOKEN]], 0)
+            joint = [
+                [IM_START_TOKEN],
+                np.tile(per_row, [resized_h]),
+                [IM_END_TOKEN],
+            ]
+            return np.concatenate(joint)
+        per_row = np.full(width, IMAGE_PATCH_TOKEN)
+        if self.image_use_col_tokens:
+            per_row = np.concatenate([per_row, [IM_COL_TOKEN]], 0)
+        joint = [
+            [IM_START_TOKEN],
+            np.tile(per_row, [height]),
+            [IM_END_TOKEN],
+        ]
+        per_row = np.full(resized_w, IMAGE_PATCH_TOKEN)
+        use_single_crop_col_tokens = (
+            self.image_use_col_tokens
+            if self.use_single_crop_col_tokens is None
+            else self.use_single_crop_col_tokens
+        )
+        image_start_token = (
+            LOW_RES_IMAGE_START_TOKEN
+            if self.use_single_crop_start_token
+            else IM_START_TOKEN
+        )
+        if use_single_crop_col_tokens:
+            per_row = np.concatenate([per_row, [IM_COL_TOKEN]], 0)
+        joint = [
+            [image_start_token],
+            np.tile(per_row, [resized_h]),
+            [IM_END_TOKEN],
+        ] + joint
+        return np.concatenate(joint)
+    def get_video_string(
+        self,
+        video_grid: np.ndarray,
+        timestamps: np.ndarray,
+    ):
+        if self.use_frame_special_tokens:
+            start_token_id = FRAME_START_TOKEN
+            end_token_id = FRAME_END_TOKEN
+        else:
+            start_token_id = IM_START_TOKEN
+            end_token_id = IM_END_TOKEN
+        num_frames, h, w = video_grid
+        video_string: str = ""
+        for frame_idx, frame_time in enumerate(timestamps):
+            # `per-frame-compact` time mode
+            prev_space = " " if frame_idx > 0 else ""
+            frame_prefix = prev_space + f"{frame_time:.1f} " # explicit whitespace before/after image tokens
+            video_string += frame_prefix
+            per_row = np.full(w, IMAGE_PATCH_TOKEN)
+            if self.video_use_col_tokens:
+                per_row = np.concatenate([per_row, [IM_COL_TOKEN]], 0)
+            extra_tokens = np.tile(per_row, [h])
+            video_tokens = [
+                [start_token_id],
+                extra_tokens,
+                [end_token_id],
+            ]
+            video_string += "".join(np.concatenate(video_tokens, 0))
+        return video_string
+    def insert_bos(
+        self,
+        input_ids: np.ndarray,
+        attention_mask: np.ndarray,
+        bos_token_id: int,
+        pad_token_id: int,
+    ):
+        """
+        Args:
+            input_ids: [B, S] array with left padding
+            attention_mask: [B, S] array (0 for pad, 1 for valid)
+            bos_token_id: int
+            pad_token_id: int
+        Returns:
+            input_ids_out: [B, S] or [B, S+1] array with bos inserted if needed
+            attention_mask_out: same shape as input_ids_out
+        """
+        need_to_expand = len(input_ids.shape) == 1
+        if need_to_expand:
+            input_ids = input_ids[None, :]
+            attention_mask = attention_mask[None, :]
+        B, S = input_ids.shape
+        # Handle zero-length sequence
+        if S == 0:
+            new_input_ids = np.full((B, 1), bos_token_id, dtype=input_ids.dtype)
+            new_attention_mask = np.ones((B, 1), dtype=attention_mask.dtype)
+            if need_to_expand:
+                new_input_ids = new_input_ids[0]
+                new_attention_mask = new_attention_mask[0]
+            return new_input_ids, new_attention_mask
+        first_valid_index = (attention_mask == 1).argmax(axis=-1)  # [B]
+        bos_already_present = np.all(input_ids[np.arange(B), first_valid_index] == bos_token_id)
+        if bos_already_present:
+            if need_to_expand:
+                input_ids = input_ids[0]
+                attention_mask = attention_mask[0]
+            return input_ids, attention_mask
+        else:
+            new_input_ids = np.full((B, S+1), pad_token_id, dtype=input_ids.dtype)
+            new_attention_mask = np.zeros((B, S+1), dtype=attention_mask.dtype)
+            src_idx = np.tile(np.arange(S), (B, 1))  # [B, S]
+            valid_mask = src_idx >= first_valid_index[:, None]  # [B, S]
+            tgt_idx = src_idx + 1  # shit right
+            batch_idx = np.tile(np.arange(B)[:, None], (1, S))  # [B, S]
+            # flatten valid_positions
+            flat_vals = input_ids[valid_mask]
+            flat_batch = batch_idx[valid_mask]
+            flat_tgt = tgt_idx[valid_mask]
+            new_input_ids[flat_batch, flat_tgt] = flat_vals
+            new_attention_mask[flat_batch, flat_tgt] = 1
+            insert_pos = first_valid_index
+            new_input_ids[np.arange(B), insert_pos] = bos_token_id
+            new_attention_mask[np.arange(B), insert_pos] = 1
+            if need_to_expand:
+                new_input_ids = new_input_ids[0]
+                new_attention_mask = new_attention_mask[0]
+            return new_input_ids, new_attention_mask
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        images: ImageInput = None,
+        videos: VideoInput = None,
+        **kwargs: Unpack[MolmoAct2ProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Args:
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            videos (`dict[str, Any]` or `list[dict[str, Any]]`):
+                The video or batch of videos to be prepared. Each video can be a dictionary with the following keys:
+                - `"frames"`: `np.ndarray` of shape (T, H, W, 3)
+                - `"timestamps"`: `np.ndarray` of shape (T,)
+                - `"sampled_fps"`: `float` (optional)
+                - `"sampling_augmentation"`: `str` (optional)
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+        Returns:
+            `BatchFeature`: A [`BatchFeature`] with the following fields:
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+            - **image_token_pooling** -- Indices of the patches in `image_grids` to pool for each token in `image_tokens`.
+              Returned when `images` is not `None`.
+            - **image_grids** -- Grids of images. Returned when `images` is not `None`.
+            - **image_num_crops** -- Number of crops for each image. Returned when `images` is not `None`.
+            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
+            - **video_token_pooling** -- Indices of the patches in `video_grids` to pool for each token in `video_tokens`.
+              Returned when `videos` is not `None`.
+            - **video_grids** -- Grids of videos. Returned when `videos` is not `None`.
+        """
+        output_kwargs = self._merge_kwargs(
+            MolmoAct2ProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        if images is not None:
+            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
+            image_grids = image_inputs["image_grids"]
+        else:
+            image_inputs = {}
+            image_grids = None
+        if videos is not None:
+            videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
+            video_grids = videos_inputs["video_grids"]
+            # If user has not requested video metadata, pop it
+            if "return_metadata" not in kwargs:
+                video_metadata = videos_inputs.pop("video_metadata")
+            else:
+                video_metadata = videos_inputs["video_metadata"]
+        else:
+            videos_inputs = {}
+            video_grids = None
+        if not isinstance(text, list):
+            text = [text]
+        text = text.copy() # below lines change text in-place
+        if image_grids is not None:
+            index = 0
+            for i in range(len(text)):
+                num_images = text[i].count(self.image_placeholder_token)
+                image_grids_i = image_grids[index:index+num_images]
+                for image_grid in image_grids_i:
+                    image_tokens = self.get_image_tokens(image_grid)
+                    image_string = "".join(image_tokens)
+                    text[i] = text[i].replace(self.image_placeholder_token, image_string, 1)
+                index += num_images
+        if video_grids is not None:
+            index = 0
+            for i in range(len(text)):
+                num_videos = text[i].count(self.video_placeholder_token)
+                assert num_videos in {0, 1}, "At most one video is supported for now"
+                video_grids_i = video_grids[index:index+num_videos]
+                metadata_i = video_metadata[index:index+num_videos]
+                for video_grid, metadata in zip(video_grids_i, metadata_i):
+                    video_string = self.get_video_string(
+                        video_grid,
+                        metadata.timestamps,
+                    )
+                    text[i] = text[i].replace(self.video_placeholder_token, video_string, 1)
+                index += num_videos
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+        input_ids = text_inputs["input_ids"]
+        attention_mask = text_inputs["attention_mask"]
+        input_ids = np.array(input_ids)
+        attention_mask = np.array(attention_mask)
+        bos = self.tokenizer.bos_token_id or self.tokenizer.eos_token_id
+        input_ids, attention_mask = self.insert_bos(
+            input_ids, attention_mask, bos, self.tokenizer.pad_token_id
+        )
+        if return_mm_token_type_ids:
+            image_tokens = np.array(self.image_token_ids).astype(input_ids.dtype)
+            token_type_ids = np.any(input_ids[:, :, None] == image_tokens[None, None, :], axis=-1)
+            text_inputs["token_type_ids"] = token_type_ids.tolist()
+        text_inputs["input_ids"] = input_ids.tolist()
+        text_inputs["attention_mask"] = attention_mask.tolist()
+        return BatchFeature(
+            data={**text_inputs, **image_inputs, **videos_inputs},
+            tensor_type=return_tensors,
+        )
+    def post_process_image_text_to_text(
+        self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
+    ):
+        """
+        Post-process the output of the model to decode the text.
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                or `(sequence_length,)`.
+            skip_special_tokens (`bool`, *optional*, defaults to `True`):
+                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
+            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+                Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
+            **kwargs:
+                Additional arguments to be passed to the tokenizer's `batch_decode method`.
+        Returns:
+            `list[str]`: The decoded text.
+        """
+        return self.tokenizer.batch_decode(
+            generated_outputs,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+MolmoAct2Processor.register_for_auto_class()

processor_config.json ADDED Viewed

	@@ -0,0 +1,85 @@

+{
+  "auto_map": {
+    "AutoProcessor": "processing_molmoact2.MolmoAct2Processor"
+  },
+  "image_processor": {
+    "auto_map": {
+      "AutoImageProcessor": "image_processing_molmoact2.MolmoAct2ImageProcessor",
+      "AutoProcessor": "processing_molmoact2.MolmoAct2Processor"
+    },
+    "crop_mode": "resize",
+    "do_convert_rgb": true,
+    "image_mean": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "image_processor_type": "MolmoAct2ImageProcessor",
+    "image_std": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "max_crops": 8,
+    "overlap_margins": [
+      4,
+      4
+    ],
+    "patch_size": 14,
+    "pooling_size": [
+      2,
+      2
+    ],
+    "resample": 2,
+    "size": {
+      "height": 378,
+      "width": 378
+    }
+  },
+  "image_use_col_tokens": true,
+  "processor_class": "MolmoAct2Processor",
+  "use_frame_special_tokens": true,
+  "use_single_crop_col_tokens": false,
+  "use_single_crop_start_token": true,
+  "video_processor": {
+    "auto_map": {
+      "AutoProcessor": "processing_molmoact2.MolmoAct2Processor",
+      "AutoVideoProcessor": "video_processing_molmoact2.MolmoAct2VideoProcessor"
+    },
+    "data_format": "channels_first",
+    "default_to_square": true,
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "frame_sample_mode": "uniform_last_frame",
+    "image_mean": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "image_std": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "max_fps": 2.0,
+    "num_frames": 8,
+    "patch_size": 14,
+    "pooling_size": [
+      3,
+      3
+    ],
+    "resample": 2,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "sampling_fps": 2,
+    "size": {
+      "height": 378,
+      "width": 378
+    },
+    "video_processor_type": "MolmoAct2VideoProcessor"
+  },
+  "video_use_col_tokens": false
+}

quantization_metadata.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "source_repo": "allenai/MolmoAct2-SO100_101",
+  "source_revision": "152569fe57914d97be91055800035f54e250d009",
+  "policy_class": "transformers:AutoModelForImageTextToText",
+  "quantization": {
+    "scheme": "nf4",
+    "backend": "bitsandbytes",
+    "compute_dtype": "bfloat16",
+    "min_params_to_quantize": 4000000,
+    "rule": "Linear modules with >=4_000_000 weight elements rewritten to bnb.nn.Linear4bit; smaller heads kept in compute_dtype (bfloat16).",
+    "runtime_status": "loader-backed (install_prequantized_linears)"
+  },
+  "dropped_state_entries": []
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d5395aefc9b1b7f0385d8c86a2f1775e5af81bdfbf9f2d97827ea37921d9f862
+size 11983605

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "add_prefix_space": false,
+  "auto_map": {
+    "AutoProcessor": "processing_molmoact2.MolmoAct2Processor"
+  },
+  "backend": "tokenizers",
+  "bos_token": "<|im_end|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<im_start>",
+    "<im_end>",
+    "<im_patch>",
+    "<im_col>",
+    "<low_res_im_start>",
+    "<|image|>",
+    "<im_low>",
+    "<frame_start>",
+    "<frame_end>",
+    "<|video|>",
+    "<|points|>",
+    "<|token_index|>",
+    "<|vit_index|>",
+    "<|vit_loc|>"
+  ],
+  "is_local": false,
+  "model_max_length": 1010000,
+  "pad_token": "<|endoftext|>",
+  "processor_class": "MolmoAct2Processor",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

video_processing_molmoact2.py ADDED Viewed

	@@ -0,0 +1,969 @@

+"""Video processor class for MolmoAct2"""
+from functools import partial
+import os
+import warnings
+from contextlib import redirect_stdout
+from io import BytesIO
+from urllib.parse import urlparse
+from typing import Optional, Union, Callable
+import numpy as np
+import requests
+import einops
+import torch
+import torchvision.transforms
+from transformers.image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ImageInput,
+    PILImageResampling,
+    SizeDict,
+    validate_kwargs,
+)
+from transformers.video_utils import (
+    VideoInput,
+    is_valid_video,
+    make_batched_videos,
+    make_batched_metadata,
+    VideoMetadata,
+)
+from transformers.processing_utils import Unpack, VideosKwargs
+from transformers.video_processing_utils import BaseVideoProcessor
+from transformers.utils import logging
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.utils import (
+    is_av_available,
+    is_decord_available,
+    is_torchcodec_available,
+    is_yt_dlp_available,
+    TensorType,
+    logging,
+    to_numpy,
+)
# Module-level logger named after this module.
logger = logging.get_logger(__name__)
# Default value of `max_fps` in `get_candidate_target_fps`: candidate rates above it are dropped.
MAX_VIDEO_FPS = 8
def normalize_image(
    image: np.ndarray,
    image_mean: list[float],
    image_std: list[float],
) -> np.ndarray:
    """
    Normalize channel-last pixel data as `(image - mean) / std`, per channel.

    The input array is never modified; a new array is returned on every path.

    Args:
        image: array whose last axis is the channel axis (e.g. `[h, w, 3]` or `[t, h, w, 3]`).
        image_mean: per-channel mean.
        image_std: per-channel standard deviation.

    Returns:
        The normalized array.
    """
    if np.allclose(image_mean, [0.5, 0.5, 0.5]) and np.allclose(image_std, [0.5, 0.5, 0.5]):
        # (x - 0.5) / 0.5 written as x * 2 - 1.
        return image * np.asarray(2.0, dtype=np.float32) - np.asarray(1.0, dtype=np.float32)
    # Out-of-place arithmetic: the previous in-place `-=` / `/=` silently changed the
    # caller's array on this path only, and could not handle integer pixel data.
    mean = np.asarray(image_mean, dtype=np.float32)
    std = np.asarray(image_std, dtype=np.float32)
    return (image - mean) / std
def resize_image(
    image: np.ndarray,
    desired_output_size: list[int],
    resample: PILImageResampling,
) -> np.ndarray:
    """
    Resize a channel-last image (3-D input) or stack of frames (any other rank, treated
    as `[t, h, w, c]`) and return float32 values scaled to the range [0, 1].

    Floating-point input is treated as already lying in [0, 1]; the only other accepted
    dtype is uint8, which is treated as [0, 255]. After resizing, values are clipped to
    that range and cast back to the input dtype before the final float32 rescale.

    Args:
        image: channel-last pixel array.
        desired_output_size: target size handed to `torchvision.transforms.Resize`.
        resample: interpolation mode handed to `torchvision.transforms.Resize`.

    Returns:
        The resized array, channel-last, dtype float32.
    """
    is_video = len(image.shape) != 3
    # The resize runs on channels-first tensors.
    to_channels_first = [0, 3, 1, 2] if is_video else [2, 0, 1]
    to_channels_last = [0, 2, 3, 1] if is_video else [1, 2, 0]

    tensor = torch.permute(torch.from_numpy(image), to_channels_first)
    source_dtype = tensor.dtype

    if torch.is_floating_point(tensor):
        value_floor, value_ceiling = 0.0, 1.0
        clip_bounds = (0.0, 1.0)
    else:
        assert tensor.dtype == torch.uint8, "SigLIP expects float images or uint8 images, but got {}".format(tensor.dtype)
        value_floor, value_ceiling = 0.0, 255.0
        clip_bounds = (0, 255)

    resizer = torchvision.transforms.Resize(
        desired_output_size,
        resample,
        antialias=False,
    )
    # Clip back into the valid range and restore the source dtype before rescaling.
    resized = torch.clip(resizer(tensor), *clip_bounds).to(source_dtype)
    resized = resized.to(torch.float32)
    resized = (resized - value_floor) / (value_ceiling - value_floor)
    return torch.permute(resized, to_channels_last).numpy()
def build_resized_image(
    image: np.ndarray,
    base_image_input_size: list[int],
    resample: PILImageResampling,
    image_mean: list[float],
    image_std: list[float],
    image_patch_size: int,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Resize and normalize `image`, and number the patches of the resized result.

    Args:
        image: pixel array passed to `resize_image`.
        base_image_input_size: `(height, width)` to resize to.
        resample: interpolation mode passed to `resize_image`.
        image_mean: per-channel mean passed to `normalize_image`.
        image_std: per-channel std passed to `normalize_image`.
        image_patch_size: side length of one square patch, in pixels.

    Returns:
        A pair `(pixels, patch_index)`: the normalized pixels with a leading axis added
        when the result was 3-D, and a `[rows, cols]` array holding
        `0 .. rows * cols - 1` in row-major order, where
        `rows = height // image_patch_size` and `cols = width // image_patch_size`.
    """
    pixels = normalize_image(
        resize_image(image, base_image_input_size, resample),
        image_mean,
        image_std,
    )
    if pixels.ndim == 3:
        # Single image: add the leading axis so the output is always 4-D.
        pixels = pixels[np.newaxis]

    patch_rows = base_image_input_size[0] // image_patch_size
    patch_cols = base_image_input_size[1] // image_patch_size
    patch_index = np.arange(patch_cols * patch_rows).reshape([patch_rows, patch_cols])
    return pixels, patch_index
def batch_pixels_to_patches(array: np.ndarray, patch_size: int) -> np.ndarray:
    """
    Cut every image into non-overlapping square patches and flatten each patch.

    Accepts `[n_images, h, w]` or `[n_images, h, w, c]` and returns
    `[n_images, n_patches, pixels_per_patch]`, with patches in row-major order and the
    pixels of a patch in row-major order (channels innermost).
    """
    if array.ndim == 3:
        n_crops, height, width = array.shape
        channel_dims = ()
    else:
        n_crops, height, width, n_channels = array.shape
        channel_dims = (n_channels,)

    patch_rows = height // patch_size
    patch_cols = width // patch_size

    # Split both spatial axes into (patch index, offset inside the patch) ...
    grid = array.reshape((n_crops, patch_rows, patch_size, patch_cols, patch_size) + channel_dims)
    # ... and bring the two patch-index axes next to each other.
    grid = np.swapaxes(grid, 2, 3)

    values_per_patch = patch_size * patch_size
    for extent in channel_dims:
        values_per_patch *= extent
    return grid.reshape((n_crops, patch_rows * patch_cols, values_per_patch))
def arange_for_pooling(
    idx_arr: np.ndarray,
    pool_h: int,
    pool_w: int,
) -> np.ndarray:
    """
    Group a 2-D index grid into `pool_h` x `pool_w` windows.

    The grid is first padded with -1 up to the next multiple of the window size on each
    axis; when the padding is odd the extra row/column goes after the data.

    Returns:
        An array of shape `[h, w, pool_h * pool_w]` where entry `[i, j]` lists the
        indices inside window `(i, j)` in row-major order, with -1 marking padding.
    """
    n_rows, n_cols = idx_arr.shape[0], idx_arr.shape[1]
    # `-n % p` is the distance from n up to the next multiple of p.
    extra_rows = -n_rows % pool_h
    extra_cols = -n_cols % pool_w
    padded = np.pad(
        idx_arr,
        [
            [extra_rows // 2, (extra_rows + 1) // 2],
            [extra_cols // 2, (extra_cols + 1) // 2],
        ],
        mode='constant',
        constant_values=-1,
    )
    out_h = padded.shape[0] // pool_h
    out_w = padded.shape[1] // pool_w
    # "(h dh) (w dw) -> h w (dh dw)": split both axes, bring the window axes last, merge them.
    windows = padded.reshape(out_h, pool_h, out_w, pool_w)
    return windows.transpose(0, 2, 1, 3).reshape(out_h, out_w, pool_h * pool_w)
def image_to_patches_and_grids(
    image: ImageInput,
    base_image_input_size: list[int],
    resample: PILImageResampling,
    image_mean: list[float],
    image_std: list[float],
    image_patch_size: int,
    image_pooling_w: int,
    image_pooling_h: int,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Resize/normalize an image, cut it into patches and compute its pooling layout.

    An integer `base_image_input_size` is treated as a square size.

    :return image_grid, `[h, w]`: the shape of the image after pooling
    :return crops, the flattened patches of the resized image, as produced by
                                `batch_pixels_to_patches`
    :return pooled_patch_idx, one row per pooled token holding the indices of the
                                patches in `crops` to pool for that token, masked with -1
    """
    if isinstance(base_image_input_size, int):
        base_image_input_size = (base_image_input_size, base_image_input_size)

    pixels, patch_index = build_resized_image(
        image,
        base_image_input_size,
        resample,
        image_mean,
        image_std,
        image_patch_size,
    )

    pooled = arange_for_pooling(patch_index, image_pooling_h, image_pooling_w)
    grid_h, grid_w = pooled.shape[:2]
    window_size = image_pooling_h * image_pooling_w

    return (
        [grid_h, grid_w],
        batch_pixels_to_patches(pixels, image_patch_size),
        pooled.reshape([-1, window_size]),
    )
def get_candidate_target_fps(
    video_fps: Union[int, float],
    sampling_fps: Union[int, float],
    max_fps: Union[int, float] = MAX_VIDEO_FPS,
) -> list[float]:
    """
    Return the multiples of `sampling_fps` that evenly divide `video_fps` and do not
    exceed `max_fps`, in increasing order.

    All three rates are truncated to integers with `int()` before use.

    Raises:
        ValueError: if `sampling_fps` is `None`, if either rate is not positive, or if
            `sampling_fps` does not divide `video_fps`.

    Examples:
        >>> get_candidate_target_fps(video_fps=6, sampling_fps=2)
        [2.0, 6.0]
        >>> get_candidate_target_fps(video_fps=5, sampling_fps=1)
        [1.0, 5.0]
        >>> get_candidate_target_fps(video_fps=2, sampling_fps=2)
        [2.0]
        >>> get_candidate_target_fps(video_fps=5, sampling_fps=2)
        Traceback (most recent call last):
            ...
        ValueError: sampling_fps=2 must divide video_fps=5.
    """
    # Must run before the int() conversion below, which would otherwise turn a
    # missing value into a TypeError and leave this check unreachable.
    if sampling_fps is None:
        raise ValueError("sampling_fps must be provided")
    video_fps = int(video_fps)
    sampling_fps = int(sampling_fps)
    max_fps = int(max_fps)
    if video_fps <= 0 or sampling_fps <= 0:
        raise ValueError(f"video_fps and sampling_fps must be positive (got {video_fps}, {sampling_fps})")
    if video_fps % sampling_fps != 0:
        raise ValueError(f"sampling_fps={sampling_fps} must divide video_fps={video_fps}.")
    # Walk the multiples of sampling_fps up to the smaller of video_fps and max_fps.
    upper = min(video_fps, max_fps)
    return [
        float(candidate)
        for candidate in range(sampling_fps, upper + 1, sampling_fps)
        if video_fps % candidate == 0
    ]
def read_video_decord(
    video_path,
    sample_timestamps_fn: Callable,
    **kwargs,
) -> np.ndarray:
    """
    Decode selected frames of a video with the Decord backend.

    Args:
        video_path (`str`):
            Path to the video file.
        sample_timestamps_fn (`Callable`):
            Called as `sample_timestamps_fn(metadata=metadata, **kwargs)`; returns the
            timestamps at which the video should be sampled, relative to the start of
            the first frame.
    Returns:
        tuple[`np.array`, `VideoMetadata`]: A tuple containing:
            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
            - `VideoMetadata` object.
    """
    # Lazy import of decord.
    # NOTE(review): presumably `importlib` is used instead of an `import` statement so
    # that static import scanners do not treat decord as a hard requirement -- confirm
    # before simplifying.
    import importlib
    decord = importlib.import_module("decord")

    reader = decord.VideoReader(uri=video_path, ctx=decord.cpu(0))  # decord has problems with gpu
    avg_fps = reader.get_avg_fps()
    n_frames = len(reader)
    # One (start, end) pair per frame.
    frame_spans = reader.get_frame_timestamp(list(range(n_frames)))
    clip_length = frame_spans[-1][1] - frame_spans[0][0]

    metadata = VideoMetadata(
        total_num_frames=int(n_frames),
        fps=float(avg_fps),
        duration=float(clip_length),
        video_backend="decord",
    )

    target_timestamps = np.array(sample_timestamps_fn(metadata=metadata, **kwargs))

    # Pick, for each requested time, the first frame whose end lies after it,
    # clamped to the last frame.
    first_frame_start = frame_spans[0, 0]
    frame_ends = frame_spans[:, 1]
    picked = np.searchsorted(frame_ends, target_timestamps + first_frame_start, side='right')
    picked = np.minimum(picked, len(frame_spans) - 1)
    video = reader.get_batch(picked).asnumpy()

    metadata.update(
        {
            "frames_indices": target_timestamps * avg_fps,
            "height": video.shape[1],
            "width": video.shape[2],
        }
    )
    return video, metadata
def read_video_torchcodec(
    video_path,
    sample_timestamps_fn: Callable,
    **kwargs,
) -> np.ndarray:
    """
    Decode selected frames of a video with the torchcodec decoder.

    Args:
        video_path (`str`):
            Path to the video file.
        sample_timestamps_fn (`Callable`):
            Called as `sample_timestamps_fn(metadata=metadata, **kwargs)`; returns the
            timestamps at which the video should be sampled. They must lie in
            `[0, duration]` (checked with assertions).
    Returns:
        tuple[`np.array`, `VideoMetadata`]: A tuple containing:
            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
            - `VideoMetadata` object.
    """
    # Lazy import of torchcodec.
    # NOTE(review): presumably `importlib` is used instead of an `import` statement so
    # that static import scanners do not treat torchcodec as a hard requirement --
    # confirm before simplifying.
    import importlib
    torchcodec = importlib.import_module("torchcodec")

    decoder = torchcodec.decoders.VideoDecoder(
        video_path,
        # Interestingly `exact` mode takes less than approximate when we load the whole video
        seek_mode="exact",
        # Allow FFmpeg decide on the number of threads for efficiency
        num_ffmpeg_threads=0,
    )
    stream_info = decoder.metadata

    # If the first frame starts at > 0, we effectively clip the video starting at that time
    # since (most) video players would also skip to that time
    time_offset = stream_info.begin_stream_seconds_from_content
    # Note this duration does assume we started playing at `time_offset`
    duration = stream_info.duration_seconds

    metadata = VideoMetadata(
        total_num_frames=stream_info.num_frames,
        fps=stream_info.average_fps,
        duration=duration,
        video_backend="torchcodec",
        height=stream_info.height,
        width=stream_info.width,
    )

    target_timestamps = sample_timestamps_fn(metadata=metadata, **kwargs)

    # Floating point/rounding issues might cause `target_timestamps` to be very slightly
    # out-of-bounds, to handle this we sanity check then clip them
    assert all(x >= 0 for x in target_timestamps)
    assert all(x < duration+1e-6 for x in target_timestamps)

    # 1e-6 padding since torchcodec can throw out-of-bounds errors even if you ask for the
    # exact boundary value, we should still get the first/last frame anyway
    latest = stream_info.end_stream_seconds_from_content - 1e-6
    earliest = stream_info.begin_stream_seconds_from_content + 1e-6

    # Note we avoid using numpy ops here to reduce floating precision issues
    seek_times = [max(earliest, min(latest, t + time_offset)) for t in target_timestamps]

    decoded = decoder.get_frames_played_at(seek_times).data.numpy()
    video = decoded.transpose(0, 2, 3, 1)  # Convert to THWC format

    target_timestamps = np.array(target_timestamps)
    metadata.frames_indices = target_timestamps * metadata.fps
    return video, metadata
def read_video_pyav(
    video_path,
    sample_timestamps_fn: Callable,
    **kwargs,
) -> "tuple[np.ndarray, VideoMetadata]":
    """
    Decode selected frames of a video with the PyAV backend.

    All frames of the first video stream are decoded into memory before sampling.

    Args:
        video_path (`str`):
            Path to the video file.
        sample_timestamps_fn (`Callable`):
            Called as `sample_timestamps_fn(metadata=metadata, **kwargs)`; returns the
            timestamps at which the video should be sampled, relative to the start of
            the first frame.
    Returns:
        tuple[`np.array`, `VideoMetadata`]: A tuple containing:
            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
            - `VideoMetadata` object.
    """
    # Lazy import of PyAV.
    # NOTE(review): presumably `importlib` is used instead of an `import` statement so
    # that static import scanners do not treat PyAV as a hard requirement -- confirm
    # before simplifying.
    import importlib
    av = importlib.import_module("av")

    with av.open(video_path) as container:
        video_stream = container.streams.video[0]
        fps = video_stream.average_rate or video_stream.guessed_rate
        frames = list(container.decode(video=0))

        time_base = video_stream.time_base
        # Frame PTS values are in `time_base` ticks; convert to seconds once.
        start = frames[0].pts * time_base
        last_frame_start = frames[-1].pts * time_base

        container_end = video_stream.duration
        if container_end is not None:
            container_end *= time_base
        # Both sides of this comparison are in seconds. (Comparing against the raw
        # tick value of the last PTS made the stream duration look too short for
        # virtually every file, so it was never used.)
        if container_end is None or container_end < last_frame_start:
            # Some problem with stream duration, so use the frame PTS directly
            # and guess the duration of the last frame
            end = last_frame_start + 1/fps
        else:
            end = container_end
        duration = float(end - start)

        metadata = VideoMetadata(
            total_num_frames=len(frames),
            fps=float(fps),
            duration=float(duration),
            video_backend="pyav",
            height=video_stream.height,
            width=video_stream.width,
        )

        target_timestamps = sample_timestamps_fn(metadata=metadata, **kwargs)
        offset = float(start)
        target_timestamps = np.array(target_timestamps)

        # A frame ends where the next one starts; the last frame ends at `end`. These
        # are absolute times, matching `target_timestamps + offset` below.
        end_time_stamps = np.array(
            [float(frame.pts * time_base) for frame in frames[1:]] + [float(end)]
        )
        indices = np.searchsorted(end_time_stamps, target_timestamps + offset, side='right')
        indices = np.minimum(indices, len(end_time_stamps) - 1)

        video = np.stack(
            [frames[i].to_ndarray(format="rgb24", channel_last=True) for i in indices],
            axis=0,
        )

        # `fps` is a rational value here; convert it so the result is a float array
        # rather than an object array.
        metadata.frames_indices = target_timestamps * float(fps)
        return video, metadata
# Maps a `backend` name (as accepted by `load_video`) to the function that decodes a
# video with that library.
VIDEO_DECODERS = {
    "decord": read_video_decord,
    "torchcodec": read_video_torchcodec,
    "pyav": read_video_pyav,
}
+def load_video(
+    video: VideoInput,
+    backend: str = "decord",
+    sample_timestamps_fn: Optional[Callable] = None,
+    **kwargs,
+):
+    """
+    Loads `video` to a numpy array.
+    Args:
+        video (`VideoInput`):
+            The video to convert to the numpy array format. Can be a link to video or local path.
+        backend (`str`, *optional*, defaults to `"decord"`):
+            The backend to use when loading the video. Can be any of ["decord", "pyav", ""torchcodec"]. Defaults to "decord".
+        sample_timestamps_fn (`Callable`):
+            A callable function that will return timestamps at which the video should be sampled.
+    """
+    # Early exit if provided an array or `PIL` frames
+    if not isinstance(video, str):
+        metadata = [None] * len(video)
+        return video, metadata
+    if urlparse(video).netloc in ["www.youtube.com", "youtube.com"]:
+        if not is_yt_dlp_available():
+            raise ImportError("To load a video from YouTube url you have  to install `yt_dlp` first.")
+        # Lazy import from yt_dlp
+        import importlib
+        yt_dlp = importlib.import_module("yt_dlp")
+        buffer = BytesIO()
+        with redirect_stdout(buffer), yt_dlp.YoutubeDL() as f:
+            f.download([video])
+        bytes_obj = buffer.getvalue()
+        file_obj = BytesIO(bytes_obj)
+    elif video.startswith("http://") or video.startswith("https://"):
+        file_obj = BytesIO(requests.get(video).content)
+    elif os.path.isfile(video):
+        file_obj = video
+    else:
+        raise TypeError("Incorrect format used for video. Should be an url linking to an video or a local path.")
+    # can also load with decord, but not cv2/torchvision
+    # both will fail in case of url links
+    video_is_url = video.startswith("http://") or video.startswith("https://")
+    if video_is_url and backend == "opencv":
+        raise ValueError("If you are trying to load a video from URL, you cannot use 'opencv' as backend")
+    if (
+        (not is_decord_available() and backend == "decord")
+        or (not is_torchcodec_available() and backend == "torchcodec")
+        or (not is_av_available() and backend == "pyav")
+    ):
+        raise ImportError(
+            f"You chose backend={backend} for loading the video but the required library is not found in your environment "
+            f"Make sure to install {backend} before loading the video."
+        )
+    video_decoder = VIDEO_DECODERS[backend]
+    video, metadata = video_decoder(file_obj, sample_timestamps_fn, **kwargs)
+    return video, metadata
+def get_target_fps(
+    video_fps: float,
+    max_frames: int,
+    total_frames: int,
+    frame_sample_mode: str,
+    candidate_target_fps: tuple[float],
+) -> float:
+    """
+    Get the target fps that best spans the video and has the most frames sampled
+    """
+    num_frames_sampled = 0
+    selected_target_fps = None
+    for target_fps in candidate_target_fps:
+        step_size = max(int(video_fps / target_fps), 1)
+        num_frames_sampled_at_fps = int(total_frames / step_size)
+        if num_frames_sampled == 0:
+            if "uniform" in frame_sample_mode:
+                if num_frames_sampled_at_fps > max_frames:
+                    break
+            selected_target_fps = target_fps
+            num_frames_sampled = num_frames_sampled_at_fps
+        else:
+            # the candidate sampling fps increases so frame count can't decrease
+            assert num_frames_sampled <= num_frames_sampled_at_fps
+            if num_frames_sampled_at_fps > max_frames:
+                # choose the sampling fps that spans the video
+                continue
+            elif num_frames_sampled_at_fps > num_frames_sampled:
+                # both are less than max_frames, choose the one with higher density of frames sampled
+                selected_target_fps = target_fps
+                num_frames_sampled = num_frames_sampled_at_fps
+    return selected_target_fps
+def get_frame_times_and_chosen_fps(
+    selected_target_fps,
+    total_frames,
+    max_frames,
+    video_fps
+):
+    if selected_target_fps is None:
+        frame_indices = np.linspace(0, total_frames, max_frames, endpoint=False, dtype=int)
+    else:
+        step_size = max(int(video_fps / selected_target_fps), 1)
+        frame_indices = np.arange(0, total_frames, step_size)
+    if len(frame_indices) > max_frames:
+        frame_indices = frame_indices[:max_frames]
+    return selected_target_fps, frame_indices
+class MolmoAct2VideoProcessorKwargs(VideosKwargs, total=False):  # extra keyword arguments accepted by MolmoAct2VideoProcessor
+    patch_size: Optional[int]  # spatial patch size of the vision encoder
+    pooling_size: Optional[list[int]]  # pooling size of the vision adapter; unpacked as (height, width) in `_preprocess`
+    frame_sample_mode: Optional[str]  # frame sampling mode; the samplers handle "uniform_last_frame" and "fps"
+    max_fps: Optional[int]  # maximum frames per second to sample in "uniform_last_frame" mode
+    sampling_fps: Optional[int]  # sampling frames per second, used when `frame_sample_mode` is "fps"
+class MolmoAct2VideoProcessor(BaseVideoProcessor):  # video processor producing per-frame crops, pooling indices and grids
+    resample = PILImageResampling.BILINEAR  # default resampling filter, used by `_preprocess` when none is passed
+    size = {"height": 378, "width": 378}  # default frame size; must provide both "height" and "width"
+    image_mean = IMAGENET_STANDARD_MEAN  # default normalization mean
+    image_std = IMAGENET_STANDARD_STD  # default normalization standard deviation
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    do_convert_rgb = True
+    patch_size = 14  # default spatial patch size of the vision encoder
+    pooling_size = [3, 3]  # default pooling size of the vision adapter, read as (height, width)
+    do_sample_frames = True  # when true, `preprocess` builds an index sampler from `sample_frames`
+    frame_sample_mode = "uniform_last_frame"  # default sampling mode; "fps" is the other mode the samplers handle
+    max_fps = 2  # default cap on sampled frames per second in "uniform_last_frame" mode
+    sampling_fps = 2  # default sampling rate for "fps" mode
+    valid_kwargs = MolmoAct2VideoProcessorKwargs  # its annotations drive kwarg validation and defaults in `preprocess`
+    model_input_names = ["pixel_values_videos", "video_token_pooling", "video_grids"]  # same keys that `_preprocess` returns
+    def __init__(self, **kwargs: Unpack[MolmoAct2VideoProcessorKwargs]):
+        super().__init__(**kwargs)
+        if self.size is not None and (
+            self.size.get("height", None) is None or self.size.get("width", None) is None
+        ):
+            raise ValueError("size must contain 'height' and 'width' keys.")
+    def _further_process_kwargs(
+        self,
+        size: Optional[SizeDict] = None,
+        **kwargs,
+    ) -> dict:
+        """
+        Update kwargs that need further processing before being validated
+        Can be overridden by subclasses to customize the processing of kwargs.
+        """
+        if size is not None and ("height" not in size or "width" not in size):
+            raise ValueError("size must contain 'height' and 'width' keys.")
+        return super()._further_process_kwargs(size=size, **kwargs)
+    def sample_times(
+        self,
+        metadata: VideoMetadata,
+        frame_sample_mode: str,
+        num_frames: int,
+        max_fps: Optional[int] = None,
+        sampling_fps: Optional[int] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Time-based sampling if an array video is passed
+        Args:
+            metadata (`VideoMetadata`):
+                Metadata of the video containing information about total duration, fps and total number of frames.
+            frame_sample_mode (`str`, *optional*):
+                Mode to sample frames. Defaults to `self.frame_sample_mode`.
+            num_frames (`int`, *optional*):
+                Maximum number of frames to sample. Defaults to `self.num_frames`.
+            man_fps (`int`, *optional*):
+                Maximum frames per second to sample.
+            sampling_fps (`int`, *optional*):
+                Sampling frames per second. Defaults to `self.sampling_fps`.
+                Used when `frame_sample_mode` is `"fps"`.
+        """
+        frame_sample_mode = frame_sample_mode or self.frame_sample_mode
+        num_frames = num_frames or self.num_frames
+        sampling_fps = sampling_fps or self.sampling_fps
+        duration = metadata.duration or metadata.total_num_frames / metadata.fps
+        if frame_sample_mode == "fps":
+            candidate_target_fps = get_candidate_target_fps(metadata.fps, sampling_fps)
+            # Try larger and larger FPSs until we hit one that can't span the video
+            target_fps = candidate_target_fps[0]
+            for candidate_fps in candidate_target_fps[1:]:
+                if num_frames / candidate_fps < duration:
+                    break
+                target_fps = candidate_fps
+            times = np.arange(0, num_frames) / target_fps
+            times = times[times < duration]
+            return times
+        elif frame_sample_mode == "uniform_last_frame":
+            if max_fps is not None:
+                max_duration = (num_frames-1) / max_fps  # -1 to include the last frame
+                if max_duration < duration:
+                    times = np.linspace(
+                        0, duration, num=num_frames, endpoint=True, dtype=np.float64
+                    )
+                else:
+                    times = np.arange(0.0, stop=duration, step=1/max_fps)
+                    times = np.concatenate([times, [duration]], axis=0)
+                    assert len(times) <= num_frames
+            else:
+                times = np.linspace(
+                    0, duration, num=num_frames, endpoint=True, dtype=np.float64
+                )
+            return times
+        else:
+            raise NotImplementedError(frame_sample_mode)
+    def sample_frames(
+        self,
+        metadata: VideoMetadata,
+        frame_sample_mode: Optional[str] = None,
+        num_frames: Optional[int] = None,
+        max_fps: Optional[int] = None,
+        sampling_fps: Optional[int] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Frame-based sampling if an array video is passed
+        Args:
+            metadata (`VideoMetadata`):
+                Metadata of the video containing information about total duration, fps and total number of frames.
+            frame_sample_mode (`str`, *optional*):
+                Mode to sample frames. Defaults to `self.frame_sample_mode`.
+            num_frames (`int`, *optional*):
+                Maximum number of frames to sample. Defaults to `self.num_frames`.
+            max_fps (`int`, *optional*):
+                Maximum frames per second to sample.
+            sampling_fps (`int`, *optional*):
+                Sampling frames per second. Defaults to `self.sampling_fps`.
+                Used when `frame_sample_mode` is `"fps"`.
+        """
+        frame_sample_mode = frame_sample_mode or self.frame_sample_mode
+        num_frames = num_frames or self.num_frames
+        sampling_fps = sampling_fps or self.sampling_fps
+        total_num_frames = metadata.total_num_frames
+        if frame_sample_mode == "uniform_last_frame" and max_fps is not None:
+            duration = total_num_frames / metadata.fps
+            if total_num_frames <= 2:
+                return np.arange(total_num_frames).astype(int)
+            if duration > (num_frames - 1) / max_fps:  # -1 to include the last frame
+                # uniform fallback
+                indices = np.linspace(
+                    0,
+                    total_num_frames - 1,
+                    num=min(num_frames, total_num_frames),
+                    endpoint=True,
+                ).astype(int)
+                return indices
+            else:
+                float_indices = np.arange(
+                    0.0, stop=total_num_frames - 1, step=float(metadata.fps / max_fps),
+                )
+                if np.round(float_indices[-1]) != total_num_frames - 1:
+                    float_indices = np.concatenate([float_indices, [total_num_frames - 1]], axis=0)
+                indices = np.round(float_indices).astype(int)
+                assert indices[-1] < total_num_frames
+                assert len(float_indices) <= num_frames
+                return indices
+        elif frame_sample_mode == "uniform_last_frame":
+            indices = np.linspace(
+                0, total_num_frames - 1, num=min(num_frames, total_num_frames), endpoint=True,
+            ).astype(int)
+            return indices
+        elif frame_sample_mode == "fps":
+            candidate_target_fps = get_candidate_target_fps(metadata.fps, sampling_fps)
+            selected_target_fps = get_target_fps(
+                metadata.fps,
+                num_frames,
+                total_num_frames,
+                frame_sample_mode,
+                candidate_target_fps,
+            )
+            _, indices = get_frame_times_and_chosen_fps(
+                selected_target_fps,
+                total_num_frames,
+                num_frames,
+                metadata.fps,
+            )
+            return indices
+        else:
+            raise NotImplementedError(frame_sample_mode)
+    def fetch_videos(
+        self,
+        video_url_or_urls: Union[str, list[str], list[list[str]]],
+        sample_timestamps_fn=None
+    ):
+        """
+        Convert a single or a list of urls into the corresponding `np.array` objects.
+        If a single url is passed, the return value will be a single object. If a list is passed a list of objects is
+        returned.
+        """
+        if (
+            (not is_decord_available())
+            and (not is_torchcodec_available())
+            and (not is_av_available())
+        ):
+            raise ImportError(
+                "MolmoAct2VideoProcessor requires `decord`, `torchcodec`, or `av` to be installed."
+            )
+        if is_decord_available():
+            backend = "decord"
+        elif is_torchcodec_available():
+            warnings.warn(
+                "`decord` is not installed and cannot be used to decode the video by default. "
+                "Falling back to `torchcodec`."
+            )
+            backend = "torchcodec"
+        else:
+            warnings.warn(
+                "`decord` is not installed and cannot be used to decode the video by default. "
+                "Falling back to `PyAV`."
+            )
+            backend = "pyav"
+        if isinstance(video_url_or_urls, list):
+            return list(zip(*[self.fetch_videos(x, sample_timestamps_fn=sample_timestamps_fn) for x in video_url_or_urls]))
+        else:
+            return load_video(video_url_or_urls, backend=backend, sample_timestamps_fn=sample_timestamps_fn)
+    def _decode_and_sample_videos(
+        self,
+        videos: VideoInput,
+        video_metadata: Union[VideoMetadata, dict],
+        do_sample_frames: Optional[bool] = None,
+        sample_indices_fn: Optional[Callable] = None,
+        sample_timestamps_fn: Optional[Callable] = None,
+    ):
+        """
+        Decode input videos and sample frames if needed.
+        """
+        videos = make_batched_videos(videos)
+        video_metadata = make_batched_metadata(videos, video_metadata=video_metadata)
+        # Framed-based sampling if an array video is passed
+        # Otherwise, time-based sampling with decoding
+        if is_valid_video(videos[0]) and do_sample_frames:
+            assert video_metadata[0].fps is not None, "FPS must be provided for video input"
+            sampled_videos = []
+            sampled_metadata = []
+            for video, metadata in zip(videos, video_metadata):
+                indices = sample_indices_fn(metadata=metadata)
+                metadata.frames_indices = indices
+                sampled_videos.append(video[indices])
+                sampled_metadata.append(metadata)
+            videos = sampled_videos
+            video_metadata = sampled_metadata
+        elif not is_valid_video(videos[0]):
+            if sample_indices_fn is None:
+                logger.warning(
+                    "do_sample_frames is False, but video array is not provided: "
+                    "Will decode the video and sample frames using MolmoAct2's default sampling mode"
+                )
+            if isinstance(videos[0], list):
+                raise ValueError(
+                    "A list of images is not supported for video input!"
+                )
+            else:
+                videos, video_metadata = self.fetch_videos(videos, sample_timestamps_fn=sample_timestamps_fn)
+        return videos, video_metadata
+    def _prepare_input_videos(
+        self,
+        videos: VideoInput,
+        **kwargs,
+    ) -> list[np.ndarray]:
+        processed_videos = [to_numpy(video) for video in videos]
+        return processed_videos
+    def preprocess(
+        self,
+        videos: VideoInput,
+        **kwargs: Unpack[MolmoAct2VideoProcessorKwargs],
+    ) -> BatchFeature:
+        validate_kwargs(
+            captured_kwargs=kwargs.keys(),
+            valid_processor_keys=list(self.valid_kwargs.__annotations__.keys()) + ["return_tensors"],
+        )
+        # Set default kwargs from self. This ensures that if a kwarg is not provided
+        # by the user, it gets its default value from the instance, or is set to None.
+        for kwarg_name in self.valid_kwargs.__annotations__:
+            kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None))
+        do_sample_frames = kwargs.pop("do_sample_frames")
+        video_metadata = kwargs.pop("video_metadata")
+        sample_indices_fn = partial(self.sample_frames, **kwargs) if do_sample_frames else None
+        sample_timestamps_fn = partial(self.sample_times, **kwargs)
+        videos, video_metadata = self._decode_and_sample_videos(
+            videos,
+            video_metadata=video_metadata,
+            do_sample_frames=do_sample_frames,
+            sample_indices_fn=sample_indices_fn,
+            sample_timestamps_fn=sample_timestamps_fn,
+        )
+        videos = self._prepare_input_videos(videos=videos)
+        kwargs = self._further_process_kwargs(**kwargs)
+        return_metadata = kwargs.pop("return_metadata")
+        preprocessed_videos = self._preprocess(videos=videos, **kwargs)
+        if return_metadata:
+            preprocessed_videos["video_metadata"] = video_metadata
+        return preprocessed_videos
+    def _preprocess(
+        self,
+        videos: list[np.ndarray],
+        size: Optional[SizeDict] = None,
+        resample: Optional[PILImageResampling] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_rgb: Optional[bool] = None,
+        patch_size: Optional[int] = None,
+        pooling_size: Optional[list[int]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Preprocess a video for the model.
+        Args:
+            videos (`VideoInput`):
+                Video to preprocess.
+            size (`SizeDict`, *optional*, defaults to `self.size`):
+                Size of the image after resizing.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use when resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            patch_size (`int`, *optional*, defaults to `self.patch_size`):
+                The spatial patch size of the vision encoder.
+            pooling_size (`list[int]`, *optional*, defaults to `self.pooling_size`):
+                The pooling size of the vision adapter.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+        Returns:
+            A `BatchFeature` containing the following keys:
+                - `pixel_values_videos`: The preprocessed videos.
+                - `video_token_pooling`: The indices of the patches in `crops` to pool for each token in `video_tokens`.
+                - `video_grids`: The video grids.
+        """
+        if size.height is None or size.width is None:
+            raise ValueError("size must contain 'height' and 'width' keys.")
+        base_image_input_size = [size.height, size.width]
+        resample = resample or self.resample
+        image_mean = image_mean or self.image_mean
+        image_std = image_std or self.image_std
+        do_convert_rgb = do_convert_rgb or self.do_convert_rgb
+        patch_size = patch_size or self.patch_size
+        pooling_size = pooling_size or self.pooling_size
+        image_pooling_h, image_pooling_w = pooling_size
+        batch_grids = []
+        batch_crops = []
+        batch_pooled_patches_idx = []
+        for video in videos:
+            all_crops = []
+            pooled_patches_idx = []
+            for frame in video:
+                image_grid, crops, pooled_idx = image_to_patches_and_grids(
+                    frame,
+                    base_image_input_size,
+                    resample,
+                    image_mean,
+                    image_std,
+                    patch_size,
+                    image_pooling_w,
+                    image_pooling_h,
+                )
+                offset = sum(np.prod(x.shape[:2]) for x in all_crops)
+                pooled_idx_with_offset = np.where(pooled_idx >= 0, pooled_idx + offset, pooled_idx)
+                pooled_patches_idx.append(pooled_idx_with_offset)
+                all_crops.append(crops)
+            video_grid = np.array([len(video), image_grid[0], image_grid[1]])
+            all_crops = np.concatenate(all_crops, 0)
+            pooled_patches_idx = np.concatenate(pooled_patches_idx, 0)
+            batch_grids.append(video_grid)
+            batch_crops.append(all_crops)
+            batch_pooled_patches_idx.append(pooled_patches_idx)
+        video_grids = np.stack(batch_grids, 0)
+        pixel_values_videos = np.concatenate(batch_crops, 0)
+        video_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)
+        data =dict(
+            pixel_values_videos=pixel_values_videos,
+            video_token_pooling=video_token_pooling,
+            video_grids=video_grids,
+        )
+        return BatchFeature(data, tensor_type=return_tensors)
+MolmoAct2VideoProcessor.register_for_auto_class()  # runs at import time; presumably makes this custom class resolvable by the Auto* loaders — confirm