Upload 7 files

Browse files

Files changed (7) hide show

config.json +63 -0
configuration_rio2.py +200 -0
modeling_rio2.py +1364 -0
processing_rio2.py +128 -0
processor_config.json +7 -0
rio2_export_manifest.json +11 -0
runtime_config.json +8 -0

config.json ADDED Viewed

	@@ -0,0 +1,63 @@

+{
+  "base_model_id": "allenai/MolmoAct2-SO100_101",
+  "norm_tag": "so100_so101_molmoact2",
+  "state_dim": 6,
+  "action_dim": 6,
+  "action_horizon": 30,
+  "state_history_len": 8,
+  "action_history_len": 8,
+  "s2_token_count": 16,
+  "s2_input_width": 2560,
+  "s2_width": 1024,
+  "s1_width": 384,
+  "s1_layers": 6,
+  "s1_heads": 8,
+  "s1_dropout": 0.05,
+  "flow_inference_steps": 4,
+  "temporal_ensemble_enabled": true,
+  "temporal_ensemble_max_chunks": 4,
+  "temporal_ensemble_decay": 0.15,
+  "task_memory_enabled": true,
+  "task_memory_slots": 8,
+  "task_memory_ema": 0.97,
+  "task_memory_alpha": 0.25,
+  "task_memory_max_norm": 10.0,
+  "s1_policy_mode": "jepa_diffusion",
+  "enable_jepa_diffusion": true,
+  "diffusion_inference_steps": 1,
+  "diffusion_loss_weight": 1.0,
+  "consistency_loss_weight": 0.5,
+  "flow_loss_weight": 0.1,
+  "jepa_loss_weight": 0.1,
+  "jepa_action_prior_weight": 0.05,
+  "jepa_hidden_dim": 256,
+  "jepa_latent_dim": 256,
+  "jepa_ema_decay": 0.996,
+  "jepa_action_prior_alpha": 0.25,
+  "jepa_condition_alpha": 1.0,
+  "s1_sampling_noise_scale": 1.0,
+  "enable_s1_moe": true,
+  "s1_moe_num_experts": 10,
+  "s1_moe_top_k": 1,
+  "s1_moe_expert_hidden_dim": 177000,
+  "s1_moe_residual_scale": 0.1,
+  "dtype": "bfloat16",
+  "s2_refresh_hz": 8.0,
+  "max_s2_cache_age_s": 0.2,
+  "action_clip": 1.0,
+  "smooth_loss_weight": 0.02,
+  "action_l1_weight": 0.0,
+  "residual_alpha": 1.0,
+  "model_type": "rio2",
+  "architectures": [
+    "Rio2Model"
+  ],
+  "weight_format": "safetensors",
+  "weight_file": "model.safetensors",
+  "runtime_type": "two_rate_weight_preserved",
+  "auto_map": {
+    "AutoConfig": "configuration_rio2.Rio2Config",
+    "AutoModel": "modeling_rio2.Rio2Model",
+    "AutoProcessor": "processing_rio2.Rio2Processor"
+  }
+}

configuration_rio2.py ADDED Viewed

	@@ -0,0 +1,200 @@

+# Copyright 2026 The HuggingFace Inc. team and the Rio2 contributors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+"""RIO-2 configuration."""
+from transformers.configuration_utils import PreTrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class Rio2Config(PreTrainedConfig):
+    r"""
+    Configuration class for [`Rio2Model`].
+    RIO-2 is an SO101 robotics policy with a real-time, two-rate runtime:
+    - S2: low-frequency semantic/context refresh
+    - S1: high-frequency action generation, preferably using the larger module
+    Every constructor argument is stored unchanged as an attribute of the same
+    name. Keyword arguments that are not named in the signature are forwarded
+    to the `PreTrainedConfig` initializer.
+    """
+    model_type = "rio2"
+    # Aliases: the generic names on the left resolve to the S1 hyper-parameters
+    # on the right.
+    attribute_map = {
+        "hidden_size": "s1_width",
+        "num_attention_heads": "s1_heads",
+        "num_hidden_layers": "s1_layers",
+    }
+    def __init__(
+        self,
+        base_model_id="allenai/MolmoAct2-SO100_101",
+        norm_tag="so100_so101_molmoact2",
+        rio2_variant="weight_preserved",
+        runtime_mode="two_rate_weight_preserved",
+        state_dim=6,
+        action_dim=6,
+        action_horizon=30,
+        state_history_len=8,
+        action_history_len=8,
+        # Compact token fallback path. These remain for tests and for cases
+        # where the original action expert cannot be called directly.
+        s2_token_count=16,
+        s2_input_width=4096,
+        s2_width=1024,
+        s1_width=384,
+        s1_layers=6,
+        s1_heads=8,
+        s1_dropout=0.05,
+        flow_inference_steps=4,
+        temporal_ensemble_enabled=True,
+        temporal_ensemble_max_chunks=4,
+        temporal_ensemble_decay=0.15,
+        task_memory_enabled=True,
+        task_memory_slots=8,
+        task_memory_ema=0.97,
+        task_memory_alpha=0.25,
+        task_memory_max_norm=10.0,
+        # Weight-preserved MolmoAct2 path.
+        use_original_s2=True,
+        use_original_s1=True,
+        prefer_split_action_expert=True,
+        fallback_to_predict_action=True,
+        action_mode="continuous",
+        molmoact_num_steps=10,
+        s2_refresh_hz=8.0,
+        max_s2_cache_age_s=0.20,
+        action_clip=1.0,
+        # JEPA-style S1. This keeps the original/online S1 policy weights as
+        # the action generator and adds a small latent world-model side head.
+        # The target action encoder is updated by EMA and is used only for the
+        # self-supervised JEPA loss.
+        s1_architecture="jepa_diffusion",
+        enable_jepa_s1=False,
+        jepa_hidden_dim=256,
+        jepa_latent_dim=256,
+        jepa_layers=2,
+        jepa_heads=4,
+        jepa_loss_weight=0.10,
+        jepa_ema_decay=0.996,
+        use_jepa_action_residual=False,
+        jepa_action_alpha=0.0,
+        s1_policy_mode="jepa_diffusion",
+        enable_jepa_diffusion=True,
+        diffusion_inference_steps=1,
+        diffusion_loss_weight=1.0,
+        consistency_loss_weight=0.50,
+        flow_loss_weight=0.10,
+        jepa_action_prior_weight=0.05,
+        jepa_action_prior_alpha=0.25,
+        jepa_condition_alpha=1.0,
+        s1_sampling_noise_scale=1.0,
+        enable_s1_moe=False,
+        s1_moe_num_experts=10,
+        s1_moe_top_k=1,
+        s1_moe_expert_hidden_dim=105472,
+        s1_moe_residual_scale=0.10,
+        # Tiny tuning knobs.
+        train_adapters_only=True,
+        enable_residual_adapter=True,
+        residual_alpha=0.0,
+        residual_trainable=True,
+        enable_s1_lora=False,
+        enable_s2_lora=False,
+        lora_r=8,
+        lora_alpha=16,
+        # Training losses for fallback/adapter path.
+        smooth_loss_weight=0.02,
+        action_l1_weight=0.0,
+        torch_dtype="bfloat16",
+        load_base_on_init=False,
+        trust_remote_code=True,
+        **kwargs,
+    ):
+        # Identity of the wrapped base model and of this RIO-2 variant.
+        self.base_model_id = base_model_id
+        self.norm_tag = norm_tag
+        self.rio2_variant = rio2_variant
+        self.runtime_mode = runtime_mode
+        # State/action dimensions and history lengths.
+        self.state_dim = state_dim
+        self.action_dim = action_dim
+        self.action_horizon = action_horizon
+        self.state_history_len = state_history_len
+        self.action_history_len = action_history_len
+        # Compact-token fallback path (S2 compressor and small S1).
+        self.s2_token_count = s2_token_count
+        self.s2_input_width = s2_input_width
+        self.s2_width = s2_width
+        self.s1_width = s1_width
+        self.s1_layers = s1_layers
+        self.s1_heads = s1_heads
+        self.s1_dropout = s1_dropout
+        self.flow_inference_steps = flow_inference_steps
+        self.temporal_ensemble_enabled = temporal_ensemble_enabled
+        self.temporal_ensemble_max_chunks = temporal_ensemble_max_chunks
+        self.temporal_ensemble_decay = temporal_ensemble_decay
+        self.task_memory_enabled = task_memory_enabled
+        self.task_memory_slots = task_memory_slots
+        self.task_memory_ema = task_memory_ema
+        self.task_memory_alpha = task_memory_alpha
+        self.task_memory_max_norm = task_memory_max_norm
+        # Weight-preserved MolmoAct2 path.
+        self.use_original_s2 = use_original_s2
+        self.use_original_s1 = use_original_s1
+        self.prefer_split_action_expert = prefer_split_action_expert
+        self.fallback_to_predict_action = fallback_to_predict_action
+        self.action_mode = action_mode
+        self.molmoact_num_steps = molmoact_num_steps
+        self.s2_refresh_hz = s2_refresh_hz
+        self.max_s2_cache_age_s = max_s2_cache_age_s
+        self.action_clip = action_clip
+        # JEPA / diffusion / MoE settings for S1.
+        self.s1_architecture = s1_architecture
+        self.enable_jepa_s1 = enable_jepa_s1
+        self.jepa_hidden_dim = jepa_hidden_dim
+        self.jepa_latent_dim = jepa_latent_dim
+        self.jepa_layers = jepa_layers
+        self.jepa_heads = jepa_heads
+        self.jepa_loss_weight = jepa_loss_weight
+        self.jepa_ema_decay = jepa_ema_decay
+        self.use_jepa_action_residual = use_jepa_action_residual
+        self.jepa_action_alpha = jepa_action_alpha
+        self.s1_policy_mode = s1_policy_mode
+        self.enable_jepa_diffusion = enable_jepa_diffusion
+        self.diffusion_inference_steps = diffusion_inference_steps
+        self.diffusion_loss_weight = diffusion_loss_weight
+        self.consistency_loss_weight = consistency_loss_weight
+        self.flow_loss_weight = flow_loss_weight
+        self.jepa_action_prior_weight = jepa_action_prior_weight
+        self.jepa_action_prior_alpha = jepa_action_prior_alpha
+        self.jepa_condition_alpha = jepa_condition_alpha
+        self.s1_sampling_noise_scale = s1_sampling_noise_scale
+        self.enable_s1_moe = enable_s1_moe
+        self.s1_moe_num_experts = s1_moe_num_experts
+        self.s1_moe_top_k = s1_moe_top_k
+        self.s1_moe_expert_hidden_dim = s1_moe_expert_hidden_dim
+        self.s1_moe_residual_scale = s1_moe_residual_scale
+        # Adapter / LoRA tuning knobs.
+        self.train_adapters_only = train_adapters_only
+        self.enable_residual_adapter = enable_residual_adapter
+        self.residual_alpha = residual_alpha
+        self.residual_trainable = residual_trainable
+        self.enable_s1_lora = enable_s1_lora
+        self.enable_s2_lora = enable_s2_lora
+        self.lora_r = lora_r
+        self.lora_alpha = lora_alpha
+        # Loss weights for the fallback/adapter path, and loading options.
+        self.smooth_loss_weight = smooth_loss_weight
+        self.action_l1_weight = action_l1_weight
+        self.torch_dtype = torch_dtype
+        self.load_base_on_init = load_base_on_init
+        self.trust_remote_code = trust_remote_code
+        # Called last, after all RIO-2 attributes are set; receives only the
+        # keyword arguments not consumed above.
+        super().__init__(**kwargs)
+__all__ = ["Rio2Config"]

modeling_rio2.py ADDED Viewed

	@@ -0,0 +1,1364 @@

+# Copyright 2026 The HuggingFace Inc. team and the Rio2 contributors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+"""PyTorch RIO-2 model.
+Runtime modes:
+  - `refresh_s2(images, instruction)`: low-frequency context refresh.
+  - `act_fast(state, ...)`: high-frequency action generation.
+  - `forward(..., s2_tokens=...)`: cached-token fallback used for tests and
+    for adapter-only training when MolmoAct2 internals are unavailable.
+"""
+from __future__ import annotations
+import copy
+import inspect
+import math
+import time
+from collections.abc import Iterable
+from dataclasses import dataclass
+from typing import Any
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from PIL import Image
+from transformers.modeling_outputs import ModelOutput
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import logging
+from .configuration_rio2 import Rio2Config
+logger = logging.get_logger(__name__)
+ImageLike = Image.Image | np.ndarray | torch.Tensor
+@dataclass
+class Rio2Output(ModelOutput):
+    """Output type for RIO-2.
+    Every field is optional and defaults to ``None``.
+    """
+    # NOTE(review): field meanings below are inferred from their names only;
+    # confirm against the code that populates this output.
+    # Presumably the total training loss.
+    loss: torch.FloatTensor | None = None
+    # Presumably the predicted action chunk.
+    actions: torch.FloatTensor | None = None
+    # Presumably the compact S2 context tokens used for the prediction.
+    s2_tokens: torch.FloatTensor | None = None
+    # Presumably the individual loss terms reported next to `loss`.
+    loss_flow_mse: torch.FloatTensor | None = None
+    loss_flow_l1: torch.FloatTensor | None = None
+    loss_diffusion: torch.FloatTensor | None = None
+    loss_consistency: torch.FloatTensor | None = None
+    loss_smooth: torch.FloatTensor | None = None
+    loss_jepa: torch.FloatTensor | None = None
+    loss_jepa_prior: torch.FloatTensor | None = None
+    # Presumably the JEPA predicted and target action latents.
+    pred_action_latent: torch.FloatTensor | None = None
+    target_action_latent: torch.FloatTensor | None = None
+    # Free-form label; the only non-tensor field.
+    runtime_path: str | None = None
+def _torch_dtype_from_string(dtype_name: str | torch.dtype) -> torch.dtype:
+    """Resolve a dtype name such as ``"bf16"`` or ``"torch.float16"`` to a ``torch.dtype``.
+    Args:
+        dtype_name: A dtype name (case-insensitive, optional ``torch.`` prefix)
+            or an actual ``torch.dtype``, which is returned unchanged.
+    Returns:
+        The matching ``torch.dtype``. Unrecognised names fall back to
+        ``torch.bfloat16``, as before.
+    """
+    # A real dtype used to be stringified to "torch.float32", miss the table
+    # and silently become bfloat16; pass it through instead.
+    if isinstance(dtype_name, torch.dtype):
+        return dtype_name
+    table = {
+        "float32": torch.float32,
+        "fp32": torch.float32,
+        "float": torch.float32,
+        "float16": torch.float16,
+        "fp16": torch.float16,
+        "half": torch.float16,
+        "bfloat16": torch.bfloat16,
+        "bf16": torch.bfloat16,
+    }
+    # Accept the "torch.<name>" spelling produced by str(torch.dtype).
+    key = str(dtype_name).strip().lower().removeprefix("torch.")
+    return table.get(key, torch.bfloat16)
+def _to_pil_list(images: ImageLike | list[ImageLike] | tuple[ImageLike, ...]) -> list[Image.Image]:
+    """Convert PIL images, NumPy arrays or tensors into a list of RGB PIL images.
+    Args:
+        images: A single image or a list/tuple of images. Arrays and tensors may
+            be 2-D (grayscale), 3-D (HWC or CHW) or 4-D (a batch of 3-D images).
+            Non-uint8 data with a maximum of at most 1.5 is treated as being in
+            [0, 1] and scaled by 255; anything else is clipped to [0, 255].
+    Returns:
+        One RGB ``PIL.Image.Image`` per input image.
+    Raises:
+        TypeError: If ``images`` is not a supported type.
+    """
+    if isinstance(images, (list, tuple)):
+        return [_to_pil_list(x)[0] for x in images]
+    if isinstance(images, Image.Image):
+        return [images.convert("RGB")]
+    if torch.is_tensor(images):
+        x = images.detach().cpu()
+        if x.ndim == 4:
+            return [_to_pil_list(xx)[0] for xx in x]
+        if x.ndim == 3 and x.shape[0] in (1, 3):
+            x = x.permute(1, 2, 0)
+        # NumPy has no bfloat16, so `.numpy()` would raise on bf16 tensors.
+        if x.dtype == torch.bfloat16:
+            x = x.float()
+        # Reuse the ndarray branch so tensors get the same grayscale, RGBA and
+        # value-range handling instead of failing inside Image.fromarray.
+        return _to_pil_list(x.numpy())
+    if isinstance(images, np.ndarray):
+        arr = images
+        if arr.ndim == 4:
+            return [_to_pil_list(a)[0] for a in arr]
+        # Channel-first layout: only transpose when the last axis cannot itself
+        # be a channel axis.
+        if arr.ndim == 3 and arr.shape[0] in (1, 3, 4) and arr.shape[-1] not in (1, 3, 4):
+            arr = np.transpose(arr, (1, 2, 0))
+        if arr.ndim == 2:
+            arr = np.repeat(arr[..., None], 3, axis=-1)
+        if arr.ndim == 3 and arr.shape[-1] == 1:
+            arr = np.repeat(arr, 3, axis=-1)
+        if arr.ndim == 3 and arr.shape[-1] == 4:
+            # Drop the alpha channel.
+            arr = arr[..., :3]
+        if arr.dtype != np.uint8:
+            # Decide the value range once, before clipping.
+            if arr.max() <= 1.5:
+                arr = (np.clip(arr, 0, 1) * 255).astype(np.uint8)
+            else:
+                arr = np.clip(arr, 0, 255).astype(np.uint8)
+        return [Image.fromarray(arr).convert("RGB")]
+    raise TypeError(f"Unsupported image type: {type(images)}")
+def _move_to_device(batch: Any, device: torch.device, dtype: torch.dtype | None = None) -> Any:
+    """Recursively move every tensor inside ``batch`` to ``device``.
+    Args:
+        batch: A tensor, a mapping, a list/tuple (including namedtuples), or any
+            other object. Containers are traversed recursively.
+        device: Target device for all tensors.
+        dtype: Optional dtype applied to floating-point tensors only; integer
+            and boolean tensors keep their dtype.
+    Returns:
+        The moved structure. Mappings come back as plain ``dict``; lists and
+        tuples keep their type; non-tensor leaves are returned unchanged.
+    """
+    from collections.abc import Mapping
+    if torch.is_tensor(batch):
+        if dtype is not None and batch.is_floating_point():
+            return batch.to(device=device, dtype=dtype)
+        return batch.to(device=device)
+    # Match any Mapping, not just dict: UserDict-based containers (such as
+    # processor outputs) were previously returned without being moved.
+    if isinstance(batch, Mapping):
+        return {k: _move_to_device(v, device, dtype) for k, v in batch.items()}
+    if isinstance(batch, (list, tuple)):
+        moved = [_move_to_device(v, device, dtype) for v in batch]
+        # Namedtuples take positional fields, not a single iterable.
+        if isinstance(batch, tuple) and hasattr(batch, "_fields"):
+            return type(batch)(*moved)
+        return type(batch)(moved)
+    return batch
+def _first_existing_attr(obj: Any, names: Iterable[str]) -> Any | None:
+    """Return the value at the first dotted attribute path in ``names`` that resolves on ``obj``.
+    Each name may be a dotted path such as ``"a.b.c"``. Paths are tried in
+    order; ``None`` is returned when no path resolves completely.
+    """
+    sentinel = object()
+    for dotted_path in names:
+        target = obj
+        for attr_name in dotted_path.split("."):
+            target = getattr(target, attr_name, sentinel)
+            if target is sentinel:
+                # A segment is missing; give up on this path.
+                break
+        else:
+            # Every segment resolved.
+            return target
+    return None
+def _safe_signature_accepts(fn: Any, name: str) -> bool:
+    """Report whether ``fn`` can be called with a keyword argument called ``name``.
+    Returns ``True`` when ``name`` is a declared parameter, when ``fn`` takes
+    ``**kwargs``, or when the signature cannot be inspected at all.
+    """
+    try:
+        parameters = inspect.signature(fn).parameters
+    except (TypeError, ValueError):
+        # Uninspectable callables are optimistically assumed to accept it.
+        return True
+    if name in parameters:
+        return True
+    for parameter in parameters.values():
+        if parameter.kind == inspect.Parameter.VAR_KEYWORD:
+            return True
+    return False
+class Rio2RMSNorm(nn.Module):
+    """Root-mean-square normalisation over the last dimension with a learned scale.
+    The statistics are computed in float32 and the result is cast back to the
+    dtype of the input.
+    """
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        original_dtype = hidden_states.dtype
+        upcast = hidden_states.float()
+        mean_square = upcast.pow(2).mean(-1, keepdim=True)
+        normalised = upcast * torch.rsqrt(mean_square + self.eps)
+        scaled = self.weight * normalised
+        return scaled.to(original_dtype)
+class Rio2SinusoidalTimeEmbedding(nn.Module):
+    """Sinusoidal timestep features followed by a two-layer SiLU MLP.
+    A scalar timestep is promoted to a batch of one. When ``dim`` is odd the
+    sin/cos features are zero-padded up to ``dim`` before the MLP.
+    """
+    def __init__(self, dim: int):
+        super().__init__()
+        self.dim = dim
+        self.mlp = nn.Sequential(
+            nn.Linear(dim, dim * 2),
+            nn.SiLU(),
+            nn.Linear(dim * 2, dim),
+        )
+    def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
+        if timesteps.ndim == 0:
+            timesteps = timesteps.unsqueeze(0)
+        half = self.dim // 2
+        # Geometric frequency ladder from 1 down to 1/10000.
+        positions = torch.arange(half, device=timesteps.device, dtype=torch.float32)
+        frequencies = torch.exp(positions * -(math.log(10000.0) / max(half - 1, 1)))
+        angles = timesteps.float().unsqueeze(1) * frequencies.unsqueeze(0)
+        features = torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)
+        shortfall = self.dim - features.shape[-1]
+        if shortfall > 0:
+            features = F.pad(features, (0, shortfall))
+        mlp_dtype = self.mlp[0].weight.dtype
+        return self.mlp(features.to(dtype=mlp_dtype))
+class Rio2S1MoEResidualExpert(nn.Module):
+    """One residual expert: Linear -> SiLU -> Linear over a context vector.
+    The output layer is zero-initialised, so a fresh expert outputs all zeros.
+    """
+    def __init__(self, width: int, hidden_dim: int, flat_action_dim: int):
+        super().__init__()
+        hidden_layer = nn.Linear(width, hidden_dim)
+        output_layer = nn.Linear(hidden_dim, flat_action_dim)
+        # Start as a no-op residual.
+        nn.init.zeros_(output_layer.weight)
+        nn.init.zeros_(output_layer.bias)
+        self.net = nn.Sequential(hidden_layer, nn.SiLU(), output_layer)
+    def forward(self, context: torch.Tensor) -> torch.Tensor:
+        residual = self.net(context)
+        return residual
+class Rio2S1MoEResidualBank(nn.Module):
+    """Top-k routed bank of residual experts producing an action-shaped residual.
+    A linear router scores every expert per row of ``context``; the ``top_k``
+    best experts are evaluated on their rows and blended with softmax weights
+    taken over the selected logits only.
+    """
+    def __init__(self, config: Rio2Config, width: int):
+        super().__init__()
+        self.config = config
+        self.flat_action_dim = int(config.action_horizon * config.action_dim)
+        self.num_experts = int(config.s1_moe_num_experts)
+        # Clamp top_k into [1, num_experts].
+        self.top_k = max(1, min(int(config.s1_moe_top_k), self.num_experts))
+        expert_hidden = int(config.s1_moe_expert_hidden_dim)
+        self.router = nn.Linear(width, self.num_experts)
+        self.experts = nn.ModuleList(
+            Rio2S1MoEResidualExpert(width, expert_hidden, self.flat_action_dim)
+            for _ in range(self.num_experts)
+        )
+    def forward(self, context: torch.Tensor) -> torch.Tensor:
+        batch_size = context.shape[0]
+        top_logits, top_ids = torch.topk(self.router(context), k=self.top_k, dim=-1)
+        gates = torch.softmax(top_logits, dim=-1).to(dtype=context.dtype)
+        blended = context.new_zeros(batch_size, self.flat_action_dim)
+        for slot in range(self.top_k):
+            chosen = top_ids[:, slot]
+            gate = gates[:, slot]
+            for expert_id, expert in enumerate(self.experts):
+                rows = chosen == expert_id
+                # Only run an expert on the rows routed to it.
+                if bool(rows.any()):
+                    blended[rows] = blended[rows] + gate[rows, None] * expert(context[rows])
+        return blended.view(batch_size, self.config.action_horizon, self.config.action_dim)
+class Rio2S2ContextCompressor(nn.Module):
+    """Fallback compressor that turns a context sequence into a fixed set of tokens.
+    Weight-preserved inference prefers the original MolmoAct2 S2/S1 bridge; this
+    module is kept for small adapter training, tests, and base versions whose
+    action expert cannot be split cleanly. The context is projected to
+    ``s2_width``, pooled by ``s2_token_count`` learned queries with one softmax
+    attention step, refined by a 2-layer Transformer encoder, and RMS-normalised.
+    """
+    def __init__(self, config: Rio2Config):
+        super().__init__()
+        self.config = config
+        width = config.s2_width
+        self.in_proj = nn.Linear(config.s2_input_width, width)
+        self.query = nn.Parameter(torch.randn(config.s2_token_count, width) / math.sqrt(width))
+        # One head per 64 channels, clamped to [1, 8].
+        num_heads = max(1, min(8, width // 64))
+        encoder_layer = nn.TransformerEncoderLayer(
+            d_model=width,
+            nhead=num_heads,
+            dim_feedforward=width * 4,
+            dropout=0.0,
+            batch_first=True,
+            norm_first=True,
+            activation="gelu",
+        )
+        self.refiner = nn.TransformerEncoder(encoder_layer, num_layers=2)
+        self.norm = Rio2RMSNorm(width)
+    def forward(self, context: torch.Tensor) -> torch.Tensor:
+        # Promote an unbatched 2-D context to a batch of one.
+        if context.ndim == 2:
+            context = context[None]
+        if context.shape[-1] != self.config.s2_input_width:
+            raise ValueError(
+                f"S2 context width mismatch: got {context.shape[-1]}, expected {self.config.s2_input_width}."
+            )
+        projected = self.in_proj(context)
+        queries = self.query.unsqueeze(0).expand(projected.shape[0], -1, -1)
+        logits = (queries @ projected.transpose(-1, -2)) / math.sqrt(projected.shape[-1])
+        pooled = torch.softmax(logits, dim=-1) @ projected
+        refined = self.refiner(pooled)
+        return self.norm(refined)
+class Rio2MolmoAct2Core(nn.Module):
+    """Weight-preserved wrapper around `allenai/MolmoAct2-SO100_101`.
+    The original MolmoAct2 object is loaded once and kept as the source of truth
+    for both S2 and S1. `refresh_s2()` extracts cache/context when possible;
+    `act_original()` first tries a split action-expert call and falls back to
+    `base.predict_action()` for exact original behavior.
+    """
+    # Attribute paths probed in order on the loaded base model; the first one
+    # that exists becomes `s2_module` (see `load_base`).
+    VLM_CANDIDATES = (
+        "vlm",
+        "language_model",
+        "molmo",
+        "backbone",
+        "model",
+        "text_model",
+    )
+    # Attribute paths probed in order on the loaded base model; the first one
+    # that exists becomes `s1_module` (see `load_base`).
+    ACTION_CANDIDATES = (
+        "action_expert",
+        "flow_head",
+        "action_head",
+        "continuous_action_expert",
+        "flow_matching_head",
+        "policy_head",
+        "robot_action_head",
+    )
+    def __init__(self, config: Rio2Config):
+        """Create the wrapper without loading the base model; call `load_base()` for that."""
+        super().__init__()
+        self.config = config
+        # Filled in by `load_base()`.
+        self.base = None
+        self.processor = None
+        self.s2_module = None
+        self.s1_module = None
+        self.compressor = Rio2S2ContextCompressor(config)
+        # State captured by the most recent `refresh_s2()` call.
+        self.last_pil_images: list[Image.Image] | None = None
+        self.last_instruction: str | None = None
+        self.last_base_outputs: Any | None = None
+        self.last_s2_cache: Any | None = None
+        self.last_compact_tokens: torch.Tensor | None = None
+        self.last_refresh_time: float = 0.0
+        # Label of the action path that produced the latest actions.
+        self.last_runtime_path: str = "uninitialized"
+    @property
+    def base_device(self) -> torch.device:
+        """Device of the base model's first parameter, or of the compressor if no base is loaded."""
+        if self.base is None:
+            return next(self.compressor.parameters()).device
+        return next(self.base.parameters()).device
+    def load_base(self, device: str | torch.device | None = None, device_map: str | None = None):
+        """Load the processor and base model named by `config.base_model_id`.
+        The model is put in eval mode and probed for its S2 and S1 sub-modules;
+        a warning is logged for each one that cannot be found.
+        Args:
+            device: If given and `device_map` is None, the model is moved here.
+            device_map: Forwarded to `from_pretrained` when not None.
+        Returns:
+            `self`, to allow chaining.
+        """
+        from transformers import AutoModelForImageTextToText, AutoProcessor
+        dtype = _torch_dtype_from_string(self.config.torch_dtype)
+        self.processor = AutoProcessor.from_pretrained(
+            self.config.base_model_id,
+            trust_remote_code=self.config.trust_remote_code,
+        )
+        kwargs = {"trust_remote_code": self.config.trust_remote_code, "dtype": dtype}
+        if device_map is not None:
+            kwargs["device_map"] = device_map
+        try:
+            self.base = AutoModelForImageTextToText.from_pretrained(self.config.base_model_id, **kwargs)
+        except TypeError:
+            # Retry with the `torch_dtype` keyword instead of `dtype`.
+            kwargs.pop("dtype", None)
+            kwargs["torch_dtype"] = dtype
+            self.base = AutoModelForImageTextToText.from_pretrained(self.config.base_model_id, **kwargs)
+        if device is not None and device_map is None:
+            self.base.to(device)
+        self.base.eval()
+        self.s2_module = _first_existing_attr(self.base, self.VLM_CANDIDATES)
+        self.s1_module = _first_existing_attr(self.base, self.ACTION_CANDIDATES)
+        if self.s2_module is None:
+            logger.warning("RIO-2 could not locate a named MolmoAct2 S2/VLM module; full base forward will be used.")
+        if self.s1_module is None:
+            logger.warning("RIO-2 could not locate a named MolmoAct2 action expert; predict_action fallback will be used.")
+        return self
+    def freeze_base(self):
+        """Put the base model in eval mode and disable gradients on all its parameters (no-op if unloaded)."""
+        if self.base is not None:
+            self.base.eval()
+            for param in self.base.parameters():
+                param.requires_grad = False
+    def unfreeze_action_expert(self):
+        """Enable gradients on every `s1_module` parameter.
+        Returns:
+            The total number of parameter elements unfrozen, or 0 when no
+            `s1_module` was located.
+        """
+        if self.s1_module is None:
+            return 0
+        count = 0
+        for param in self.s1_module.parameters():
+            param.requires_grad = True
+            count += param.numel()
+        return count
+    def unfreeze_adapters_only(self):
+        """Freeze the base model and leave only the compressor trainable."""
+        self.freeze_base()
+        for param in self.compressor.parameters():
+            param.requires_grad = True
+    def _extract_sequence_context(self, outputs: Any) -> torch.Tensor | None:
+        """Pull a context tensor out of base-model outputs, or return None.
+        Tried in order: the last entry of `hidden_states`, `last_hidden_state`
+        (as attributes, then as dict keys), and finally a summary built from
+        `past_key_values`.
+        """
+        if outputs is None:
+            return None
+        if hasattr(outputs, "hidden_states") and outputs.hidden_states is not None:
+            return outputs.hidden_states[-1]
+        if hasattr(outputs, "last_hidden_state") and outputs.last_hidden_state is not None:
+            return outputs.last_hidden_state
+        if isinstance(outputs, dict):
+            if outputs.get("hidden_states") is not None:
+                return outputs["hidden_states"][-1]
+            if outputs.get("last_hidden_state") is not None:
+                return outputs["last_hidden_state"]
+        if hasattr(outputs, "past_key_values") and outputs.past_key_values is not None:
+            # One mean-pooled key vector and one value vector per layer.
+            # NOTE(review): assumes key/value are (batch, heads, seq, head_dim)
+            # so the mean over dims (-3, -2) leaves (batch, head_dim) — TODO confirm.
+            chunks = []
+            for layer in outputs.past_key_values:
+                if isinstance(layer, (tuple, list)) and len(layer) >= 2:
+                    key, value = layer[0], layer[1]
+                    chunks.append(key.float().mean(dim=(-3, -2)))
+                    chunks.append(value.float().mean(dim=(-3, -2)))
+            if chunks:
+                # NOTE(review): the `.to(dtype=...)` is a no-op, since the stacked
+                # tensor already has the dtype of `chunks[0]`.
+                return torch.stack(chunks, dim=1).to(dtype=chunks[0].dtype)
+        return None
+    def _extract_cache(self, outputs: Any) -> Any:
+        """Return the first non-None cache-like field of `outputs`.
+        Each candidate name is checked as an attribute and then as a dict key.
+        When none is found, `outputs` itself is returned.
+        """
+        if outputs is None:
+            return None
+        for name in ("past_key_values", "kv_cache", "cache", "action_cache", "vlm_cache"):
+            if hasattr(outputs, name) and getattr(outputs, name) is not None:
+                return getattr(outputs, name)
+            if isinstance(outputs, dict) and outputs.get(name) is not None:
+                return outputs[name]
+        return outputs
+    @torch.no_grad()
+    def refresh_s2(self, images: ImageLike | list[ImageLike], instruction: str, force: bool = False) -> torch.Tensor:
+        """Run the base model on `images` + `instruction` and cache the result.
+        Args:
+            images: One image or a list of images, converted with `_to_pil_list`.
+            instruction: Text passed to the processor.
+            force: When True, skip the freshness check and always recompute.
+        Returns:
+            The compact S2 tokens. These are zeros of shape
+            `(1, s2_token_count, s2_width)` when no context can be extracted or
+            when compression fails.
+        Raises:
+            RuntimeError: If the base model or processor is not loaded.
+        """
+        if self.base is None or self.processor is None:
+            # NOTE(review): the message names `load_s2_base()`, while this class
+            # defines `load_base()` — presumably the outer model's method; confirm.
+            raise RuntimeError("MolmoAct2 base is not loaded. Call model.load_s2_base() first.")
+        # NOTE(review): the age uses the wall clock (`time.time()`), so a system
+        # clock adjustment can distort it.
+        age = time.time() - self.last_refresh_time
+        # Reuse cached tokens for the same instruction while they are fresh.
+        # The images are not compared.
+        if (
+            not force
+            and self.last_compact_tokens is not None
+            and self.last_instruction == instruction
+            and age < self.config.max_s2_cache_age_s
+        ):
+            return self.last_compact_tokens
+        pil_images = _to_pil_list(images)
+        inputs = self.processor(images=pil_images, text=instruction, return_tensors="pt")
+        inputs = _move_to_device(inputs, self.base_device, _torch_dtype_from_string(self.config.torch_dtype))
+        try:
+            outputs = self.base(**inputs, use_cache=True, output_hidden_states=True, return_dict=True)
+        except TypeError:
+            # Retry without the cache/hidden-state keywords.
+            outputs = self.base(**inputs, return_dict=True)
+        self.last_base_outputs = outputs
+        self.last_s2_cache = self._extract_cache(outputs)
+        self.last_pil_images = pil_images
+        self.last_instruction = instruction
+        self.last_refresh_time = time.time()
+        sequence_context = self._extract_sequence_context(outputs)
+        if sequence_context is not None:
+            sequence_context = sequence_context.to(
+                device=next(self.compressor.parameters()).device,
+                dtype=next(self.compressor.parameters()).dtype,
+            )
+            try:
+                self.last_compact_tokens = self.compressor(sequence_context).detach()
+            except Exception as exc:
+                # Best effort: log the failure and fall back to zero tokens.
+                logger.warning("RIO-2 compact-token compression failed: %s", exc)
+                self.last_compact_tokens = torch.zeros(
+                    1,
+                    self.config.s2_token_count,
+                    self.config.s2_width,
+                    device=next(self.compressor.parameters()).device,
+                    dtype=next(self.compressor.parameters()).dtype,
+                )
+        else:
+            self.last_compact_tokens = torch.zeros(
+                1,
+                self.config.s2_token_count,
+                self.config.s2_width,
+                device=next(self.compressor.parameters()).device,
+                dtype=next(self.compressor.parameters()).dtype,
+            )
+        return self.last_compact_tokens
+    def _try_split_action_expert(
+        self,
+        state: torch.Tensor,
+        state_history: torch.Tensor | None,
+        action_history: torch.Tensor | None,
+        num_steps: int,
+    ) -> torch.Tensor | None:
+        """Try to call the located action expert directly.
+        The module itself and then its `predict_action`, `sample`,
+        `generate_actions` and `forward` methods are tried in order. Each is
+        called with whichever of the known keyword names its signature accepts.
+        Returns:
+            The first result that `_coerce_actions` can turn into a tensor, or
+            None when splitting is disabled, no expert exists, or every
+            attempt fails.
+        """
+        if not self.config.prefer_split_action_expert or self.s1_module is None:
+            return None
+        candidates = [self.s1_module]
+        for method_name in ("predict_action", "sample", "generate_actions", "forward"):
+            if hasattr(self.s1_module, method_name):
+                candidates.append(getattr(self.s1_module, method_name))
+        for fn in candidates:
+            try:
+                # `_safe_signature_accepts` is also True for callables taking
+                # **kwargs or with uninspectable signatures, so those receive
+                # every keyword below.
+                kwargs = {}
+                if _safe_signature_accepts(fn, "state"):
+                    kwargs["state"] = state
+                if _safe_signature_accepts(fn, "states"):
+                    kwargs["states"] = state
+                if _safe_signature_accepts(fn, "vlm_kv_cache"):
+                    kwargs["vlm_kv_cache"] = self.last_s2_cache
+                if _safe_signature_accepts(fn, "past_key_values"):
+                    kwargs["past_key_values"] = self.last_s2_cache
+                if _safe_signature_accepts(fn, "s2_cache"):
+                    kwargs["s2_cache"] = self.last_s2_cache
+                if _safe_signature_accepts(fn, "state_history"):
+                    kwargs["state_history"] = state_history
+                if _safe_signature_accepts(fn, "action_history"):
+                    kwargs["action_history"] = action_history
+                if _safe_signature_accepts(fn, "num_steps"):
+                    kwargs["num_steps"] = num_steps
+                if _safe_signature_accepts(fn, "num_flow_steps"):
+                    kwargs["num_flow_steps"] = num_steps
+                out = fn(**kwargs) if kwargs else fn(state)
+                actions = self._coerce_actions(out, state)
+                if actions is not None:
+                    self.last_runtime_path = "split_original_action_expert"
+                    return actions
+            except Exception as exc:
+                # A failed attempt is expected; move on to the next candidate.
+                logger.debug("RIO-2 split action expert attempt failed for %s: %s", fn, exc)
+        return None
+    def _coerce_actions(self, out: Any, state: torch.Tensor) -> torch.Tensor | None:
+        """Turn an action-expert result into a tensor on `state`'s device.
+        Accepts a tensor, an object with `.actions`, or a dict with an
+        `"actions"` entry; anything else yields None. A 2-D result gains a
+        leading dimension. The dtype follows `state`, or float32 when `state`
+        is not floating point.
+        """
+        if out is None:
+            return None
+        if torch.is_tensor(out):
+            actions = out
+        elif hasattr(out, "actions"):
+            actions = torch.as_tensor(out.actions, device=state.device)
+        elif isinstance(out, dict) and out.get("actions") is not None:
+            actions = torch.as_tensor(out["actions"], device=state.device)
+        else:
+            return None
+        if actions.ndim == 2:
+            actions = actions.unsqueeze(0)
+        return actions.to(device=state.device, dtype=state.dtype if state.is_floating_point() else torch.float32)
+    @torch.no_grad()
+    def predict_action_fallback(self, state: torch.Tensor, num_steps: int) -> torch.Tensor:
+        """Predict actions with `base.predict_action` on the last refreshed images and instruction.
+        Raises:
+            RuntimeError: If the base model is not loaded, `refresh_s2` has not
+                been called yet, or the base model has no `predict_action`.
+        """
+        if self.base is None or self.processor is None:
+            raise RuntimeError("MolmoAct2 base is not loaded.")
+        if self.last_pil_images is None or self.last_instruction is None:
+            raise RuntimeError("S2 cache is empty. Call refresh_s2(images, instruction) first.")
+        if not hasattr(self.base, "predict_action"):
+            raise RuntimeError("MolmoAct2 base has no predict_action method and split action expert was unavailable.")
+        # The state is handed over as a float32 NumPy array on the CPU.
+        state_np = state.detach().float().cpu().numpy()
+        out = self.base.predict_action(
+            processor=self.processor,
+            images=self.last_pil_images,
+            task=self.last_instruction,
+            state=state_np,
+            norm_tag=self.config.norm_tag,
+            action_mode=self.config.action_mode,
+            num_steps=num_steps,
+        )
+        actions = torch.as_tensor(out.actions, device=state.device, dtype=state.dtype if state.is_floating_point() else torch.float32)
+        if actions.ndim == 2:
+            actions = actions.unsqueeze(0)
+        self.last_runtime_path = "predict_action_fallback_exact"
+        return actions
+    @torch.no_grad()
+    def act_original(
+        self,
+        state: torch.Tensor,
+        state_history: torch.Tensor | None = None,
+        action_history: torch.Tensor | None = None,
+        num_steps: int | None = None,
+    ) -> torch.Tensor:
+        """Produce actions from the original model: split action expert first, then `predict_action`.
+        Raises:
+            RuntimeError: If the split call yields nothing and
+                `config.fallback_to_predict_action` is False.
+        """
+        # `or` means both None and 0 select the configured default.
+        steps = int(num_steps or self.config.molmoact_num_steps)
+        split_actions = self._try_split_action_expert(state, state_history, action_history, steps)
+        if split_actions is not None:
+            return split_actions
+        if self.config.fallback_to_predict_action:
+            return self.predict_action_fallback(state, steps)
+        raise RuntimeError("No callable original S1/action path was found and fallback_to_predict_action=False.")
+class Rio2FastS1FlowActionExpert(nn.Module):
+    """Small fallback S1 for cached-token training.
+    In weight-preserved RIO-2, this is not the preferred runtime path. It remains
+    as an adapter/student fallback and for upstream tests without downloading
+    MolmoAct2.
+    """
+    def __init__(self, config: Rio2Config):
+        super().__init__()
+        self.config = config
+        width = config.s1_width
+        self.s2_proj = nn.Linear(config.s2_width, width)
+        self.state_proj = nn.Linear(config.state_dim, width)
+        self.state_hist_proj = nn.Linear(config.state_dim, width)
+        self.action_hist_proj = nn.Linear(config.action_dim, width)
+        self.noisy_action_proj = nn.Linear(config.action_dim, width)
+        self.time_emb = Rio2SinusoidalTimeEmbedding(width)
+        self.type_emb = nn.Parameter(torch.randn(5, width) / math.sqrt(width))
+        self.memory_proj = nn.Linear(config.s2_width, width)
+        self.memory_type_emb = nn.Parameter(torch.randn(1, width) / math.sqrt(width))
+        self.memory_gate = nn.Parameter(torch.tensor(-2.0))
+        layer = nn.TransformerEncoderLayer(
+            d_model=width,
+            nhead=config.s1_heads,
+            dim_feedforward=width * 4,
+            dropout=config.s1_dropout,
+            batch_first=True,
+            norm_first=True,
+            activation="gelu",
+        )
+        self.blocks = nn.TransformerEncoder(layer, num_layers=config.s1_layers)
+        self.norm = Rio2RMSNorm(width)
+        self.action_head = nn.Sequential(nn.Linear(width, width), nn.SiLU(), nn.Linear(width, config.action_dim))
+        self.noise_head = nn.Sequential(nn.Linear(width, width), nn.SiLU(), nn.Linear(width, config.action_dim))
+        hidden = int(config.jepa_hidden_dim)
+        latent = int(config.jepa_latent_dim)
+        self.jepa_s2_proj = nn.Linear(config.s2_width, hidden)
+        self.jepa_memory_proj = nn.Linear(config.s2_width, hidden)
+        self.jepa_state_proj = nn.Linear(config.state_dim, hidden)
+        self.jepa_action_hist_proj = nn.Linear(config.action_dim, hidden)
+        self.jepa_norm = Rio2RMSNorm(hidden)
+        self.jepa_predictor = nn.Sequential(nn.Linear(hidden, hidden), nn.SiLU(), nn.Linear(hidden, latent))
+        flat_action_dim = config.action_horizon * config.action_dim
+        self.action_encoder = nn.Sequential(nn.Linear(flat_action_dim, hidden), nn.SiLU(), nn.Linear(hidden, latent))
+        self.target_action_encoder = copy.deepcopy(self.action_encoder)
+        for param in self.target_action_encoder.parameters():
+            param.requires_grad = False
+        self.jepa_to_action_prior = nn.Sequential(
+            nn.Linear(latent, hidden),
+            nn.SiLU(),
+            nn.Linear(hidden, flat_action_dim),
+        )
+        self.consistency_head = nn.Sequential(
+            nn.Linear(latent, hidden),
+            nn.SiLU(),
+            nn.Linear(hidden, flat_action_dim),
+        )
+        self.jepa_condition_proj = nn.Linear(latent, width)
+        nn.init.zeros_(self.jepa_to_action_prior[-1].weight)
+        nn.init.zeros_(self.jepa_to_action_prior[-1].bias)
+        nn.init.zeros_(self.jepa_condition_proj.weight)
+        nn.init.zeros_(self.jepa_condition_proj.bias)
+        self.moe_residual = Rio2S1MoEResidualBank(config, width) if bool(config.enable_s1_moe) else None
+    def default_task_memory_from_s2(self, s2_tokens):
+        if s2_tokens.ndim == 2:
+            s2_tokens = s2_tokens.unsqueeze(0)
+        batch_size, token_count, width = s2_tokens.shape
+        slots = max(1, int(self.config.task_memory_slots))
+        if token_count >= slots:
+            return s2_tokens[:, :slots]
+        pad_value = s2_tokens.mean(dim=1, keepdim=True).expand(batch_size, slots - token_count, width)
+        return torch.cat([s2_tokens, pad_value], dim=1)
+    def _prepare_task_memory(self, task_memory, s2_tokens, batch_size, device, dtype):
+        if not bool(self.config.task_memory_enabled):
+            return None
+        if task_memory is None:
+            task_memory = self.default_task_memory_from_s2(s2_tokens)
+        if task_memory.ndim == 2:
+            task_memory = task_memory.unsqueeze(0)
+        task_memory = task_memory.to(device=device, dtype=dtype)
+        if task_memory.shape[0] == 1 and batch_size > 1:
+            task_memory = task_memory.expand(batch_size, -1, -1)
+        elif task_memory.shape[0] != batch_size:
+            task_memory = task_memory[:1].expand(batch_size, -1, -1)
+        slots = max(1, int(self.config.task_memory_slots))
+        if task_memory.shape[1] < slots:
+            pad = task_memory.mean(dim=1, keepdim=True).expand(batch_size, slots - task_memory.shape[1], task_memory.shape[2])
+            task_memory = torch.cat([task_memory, pad], dim=1)
+        return task_memory[:, :slots]
+    def _prepare_hist(self, values, length, dim, batch_size, device, dtype):
+        if values is None:
+            return torch.zeros(batch_size, length, dim, device=device, dtype=dtype)
+        if values.ndim == 2:
+            values = values.unsqueeze(0)
+        values = values.to(device=device, dtype=dtype)
+        if values.shape[1] < length:
+            pad = torch.zeros(values.shape[0], length - values.shape[1], values.shape[2], device=device, dtype=dtype)
+            values = torch.cat([pad, values], dim=1)
+        return values[:, -length:]
+    def _decode(self, s2_tokens, state, state_history, action_history, noisy_actions, timesteps, head, jepa_latent=None, task_memory=None):
+        if state.ndim == 1:
+            state = state.unsqueeze(0)
+        if noisy_actions.ndim == 2:
+            noisy_actions = noisy_actions.unsqueeze(0)
+        if s2_tokens.ndim == 2:
+            s2_tokens = s2_tokens.unsqueeze(0)
+        batch_size = state.shape[0]
+        device = state.device
+        dtype = state.dtype if state.is_floating_point() else torch.float32
+        state = state.to(device=device, dtype=dtype)
+        noisy_actions = noisy_actions.to(device=device, dtype=dtype)
+        s2_tokens = s2_tokens.to(device=device, dtype=dtype)
+        state_history = self._prepare_hist(state_history, self.config.state_history_len, self.config.state_dim, batch_size, device, dtype)
+        action_history = self._prepare_hist(action_history, self.config.action_history_len, self.config.action_dim, batch_size, device, dtype)
+        task_memory = self._prepare_task_memory(task_memory, s2_tokens, batch_size, device, dtype)
+        s2_tok = self.s2_proj(s2_tokens) + self.type_emb[0]
+        token_chunks = [s2_tok]
+        if task_memory is not None:
+            gate = torch.sigmoid(self.memory_gate).to(dtype=s2_tok.dtype)
+            mem_tok = gate * float(self.config.task_memory_alpha) * self.memory_proj(task_memory) + self.memory_type_emb
+            token_chunks.append(mem_tok)
+        state_tok = self.state_proj(state).unsqueeze(1) + self.type_emb[1]
+        state_hist_tok = self.state_hist_proj(state_history) + self.type_emb[2]
+        action_hist_tok = self.action_hist_proj(action_history) + self.type_emb[3]
+        action_tok = self.noisy_action_proj(noisy_actions) + self.type_emb[4]
+        action_tok = action_tok + self.time_emb(timesteps).unsqueeze(1)
+        if jepa_latent is not None and bool(self.config.enable_jepa_diffusion):
+            cond = self.jepa_condition_proj(jepa_latent.to(device=device, dtype=dtype)).unsqueeze(1)
+            action_tok = action_tok + float(self.config.jepa_condition_alpha) * cond.to(dtype=action_tok.dtype)
+        token_chunks.extend([state_tok, state_hist_tok, action_hist_tok, action_tok])
+        tokens = torch.cat(token_chunks, dim=1)
+        tokens = self.blocks(tokens)
+        tokens = self.norm(tokens)
+        return head(tokens[:, -self.config.action_horizon :])
+    def velocity(self, s2_tokens, state, state_history, action_history, noisy_actions, timesteps, jepa_latent=None, task_memory=None):
+        return self._decode(s2_tokens, state, state_history, action_history, noisy_actions, timesteps, self.action_head, jepa_latent, task_memory)
+    def diffusion_noise(self, s2_tokens, state, state_history, action_history, noisy_actions, timesteps, jepa_latent=None, task_memory=None):
+        return self._decode(s2_tokens, state, state_history, action_history, noisy_actions, timesteps, self.noise_head, jepa_latent, task_memory)
+    def predict_action_latent(self, s2_tokens, state, action_history=None, task_memory=None):
+        if state.ndim == 1:
+            state = state.unsqueeze(0)
+        if s2_tokens.ndim == 2:
+            s2_tokens = s2_tokens.unsqueeze(0)
+        batch_size = state.shape[0]
+        device = state.device
+        dtype = state.dtype if state.is_floating_point() else torch.float32
+        action_history = self._prepare_hist(action_history, self.config.action_history_len, self.config.action_dim, batch_size, device, dtype)
+        s2_summary = s2_tokens.to(device=device, dtype=dtype).mean(dim=1)
+        task_memory = self._prepare_task_memory(task_memory, s2_tokens, batch_size, device, dtype)
+        memory_summary = torch.zeros_like(s2_summary) if task_memory is None else task_memory.mean(dim=1)
+        hist_summary = action_history.mean(dim=1)
+        memory_scale = torch.sigmoid(self.memory_gate).to(dtype=s2_summary.dtype) * float(self.config.task_memory_alpha)
+        context = (
+            self.jepa_s2_proj(s2_summary)
+            + memory_scale * self.jepa_memory_proj(memory_summary)
+            + self.jepa_state_proj(state.to(dtype=dtype))
+            + self.jepa_action_hist_proj(hist_summary)
+        )
+        return self.jepa_predictor(self.jepa_norm(context))
+    def moe_action_residual(self, s2_tokens, state, action_history=None, task_memory=None):
+        if self.moe_residual is None:
+            return None
+        if state.ndim == 1:
+            state = state.unsqueeze(0)
+        if s2_tokens.ndim == 2:
+            s2_tokens = s2_tokens.unsqueeze(0)
+        batch_size = state.shape[0]
+        device = state.device
+        dtype = state.dtype if state.is_floating_point() else torch.float32
+        action_history = self._prepare_hist(action_history, self.config.action_history_len, self.config.action_dim, batch_size, device, dtype)
+        s2_tokens = s2_tokens.to(device=device, dtype=dtype)
+        task_memory = self._prepare_task_memory(task_memory, s2_tokens, batch_size, device, dtype)
+        context = (
+            self.s2_proj(s2_tokens).mean(dim=1)
+            + self.state_proj(state.to(dtype=dtype))
+            + self.action_hist_proj(action_history).mean(dim=1)
+        )
+        if task_memory is not None:
+            gate = torch.sigmoid(self.memory_gate).to(dtype=context.dtype)
+            context = context + gate * float(self.config.task_memory_alpha) * self.memory_proj(task_memory).mean(dim=1)
+        return self.moe_residual(context).to(dtype=dtype)
+    def encode_action_latent(self, actions, target=False):
+        if actions.ndim == 2:
+            actions = actions.unsqueeze(0)
+        flat = actions.reshape(actions.shape[0], -1)
+        encoder = self.target_action_encoder if target else self.action_encoder
+        return F.normalize(encoder(flat).float(), dim=-1).to(dtype=flat.dtype)
+    def action_prior_from_latent(self, latent, dtype):
+        prior = self.jepa_to_action_prior(latent).view(latent.shape[0], self.config.action_horizon, self.config.action_dim)
+        return prior.to(dtype=dtype)
+    def consistency_action_from_latent(self, latent, dtype):
+        actions = self.consistency_head(latent).view(latent.shape[0], self.config.action_horizon, self.config.action_dim)
+        return actions.to(dtype=dtype)
+    def jepa_diffusion_sample(self, s2_tokens, state, state_history=None, action_history=None, steps=None, task_memory=None):
+        if state.ndim == 1:
+            state = state.unsqueeze(0)
+        batch_size = state.shape[0]
+        dtype = state.dtype if state.is_floating_point() else torch.float32
+        jepa_latent = self.predict_action_latent(s2_tokens, state, action_history, task_memory)
+        x = self.consistency_action_from_latent(jepa_latent, dtype)
+        if float(self.config.jepa_action_prior_alpha) != 0.0:
+            x = x + float(self.config.jepa_action_prior_alpha) * self.action_prior_from_latent(jepa_latent, dtype)
+        moe_residual = self.moe_action_residual(s2_tokens, state, action_history, task_memory)
+        if moe_residual is not None:
+            x = x + float(self.config.s1_moe_residual_scale) * moe_residual
+        denoise_steps = int(steps if steps is not None else self.config.diffusion_inference_steps)
+        denoise_steps = max(0, denoise_steps)
+        if denoise_steps > 0:
+            x = x + torch.randn_like(x) * float(self.config.s1_sampling_noise_scale) / float(denoise_steps + 1)
+        for i in range(denoise_steps):
+            frac = float(denoise_steps - i) / float(max(denoise_steps, 1))
+            timesteps = torch.full((batch_size,), frac, device=state.device, dtype=dtype)
+            eps = self.diffusion_noise(s2_tokens, state, state_history, action_history, x, timesteps, jepa_latent, task_memory)
+            x = x - eps / float(denoise_steps + 1)
+        return x
+    @torch.no_grad()
+    def update_target_encoder(self, decay=None):
+        decay = float(self.config.jepa_ema_decay if decay is None else decay)
+        for online, target in zip(self.action_encoder.parameters(), self.target_action_encoder.parameters()):
+            target.data.mul_(decay).add_(online.data, alpha=1.0 - decay)
+    def freeze_target_encoder(self):
+        for param in self.target_action_encoder.parameters():
+            param.requires_grad = False
+    def training_loss(self, s2_tokens, state, state_history, action_history, target_actions, task_memory=None):
+        if target_actions.ndim == 2:
+            target_actions = target_actions.unsqueeze(0)
+        batch_size = target_actions.shape[0]
+        jepa_latent = self.predict_action_latent(s2_tokens, state, action_history, task_memory) if bool(self.config.enable_jepa_diffusion) else None
+        x0 = torch.randn_like(target_actions)
+        x1 = target_actions
+        timesteps = torch.rand(batch_size, device=target_actions.device, dtype=target_actions.dtype)
+        xt = (1.0 - timesteps[:, None, None]) * x0 + timesteps[:, None, None] * x1
+        target_velocity = x1 - x0
+        pred_velocity = self.velocity(s2_tokens, state, state_history, action_history, xt, timesteps, jepa_latent, task_memory)
+        loss_flow_mse = F.mse_loss(pred_velocity, target_velocity)
+        loss_flow_l1 = F.l1_loss(pred_velocity, target_velocity)
+        if bool(self.config.enable_jepa_diffusion) and float(self.config.diffusion_loss_weight) > 0:
+            diffusion_t = torch.rand(batch_size, device=target_actions.device, dtype=target_actions.dtype)
+            eps = torch.randn_like(target_actions)
+            alpha = torch.cos(diffusion_t[:, None, None] * (math.pi / 2.0))
+            sigma = torch.sin(diffusion_t[:, None, None] * (math.pi / 2.0))
+            noisy = alpha * target_actions + sigma * eps
+            pred_eps = self.diffusion_noise(s2_tokens, state, state_history, action_history, noisy, diffusion_t, jepa_latent, task_memory)
+            loss_diffusion = F.mse_loss(pred_eps, eps)
+        else:
+            loss_diffusion = target_actions.new_tensor(0.0)
+        if bool(self.config.enable_jepa_diffusion) and float(self.config.jepa_loss_weight) > 0:
+            pred_latent = F.normalize(jepa_latent.float(), dim=-1)
+            with torch.no_grad():
+                target_latent = self.encode_action_latent(target_actions, target=True).float()
+            loss_jepa = F.mse_loss(pred_latent, target_latent)
+        else:
+            loss_jepa = target_actions.new_tensor(0.0)
+        if bool(self.config.enable_jepa_diffusion) and float(self.config.jepa_action_prior_weight) > 0:
+            prior_actions = self.action_prior_from_latent(jepa_latent, target_actions.dtype)
+            loss_jepa_prior = F.mse_loss(prior_actions, target_actions)
+        else:
+            loss_jepa_prior = target_actions.new_tensor(0.0)
+        if bool(self.config.enable_jepa_diffusion) and float(self.config.consistency_loss_weight) > 0:
+            consistency_actions = self.consistency_action_from_latent(jepa_latent, target_actions.dtype)
+            moe_residual = self.moe_action_residual(s2_tokens, state, action_history, task_memory)
+            if moe_residual is not None:
+                consistency_actions = consistency_actions + float(self.config.s1_moe_residual_scale) * moe_residual
+            loss_consistency = F.mse_loss(consistency_actions, target_actions)
+        else:
+            loss_consistency = target_actions.new_tensor(0.0)
+        loss_smooth = (target_actions[:, 1:] - target_actions[:, :-1]).pow(2).mean() if target_actions.shape[1] > 1 else target_actions.new_tensor(0.0)
+        loss = (
+            self.config.flow_loss_weight * (loss_flow_mse + self.config.action_l1_weight * loss_flow_l1)
+            + self.config.smooth_loss_weight * loss_smooth
+            + self.config.diffusion_loss_weight * loss_diffusion
+            + self.config.consistency_loss_weight * loss_consistency
+            + self.config.jepa_loss_weight * loss_jepa.to(loss_flow_mse.dtype)
+            + self.config.jepa_action_prior_weight * loss_jepa_prior
+        )
+        return {
+            "loss": loss,
+            "loss_flow_mse": loss_flow_mse,
+            "loss_flow_l1": loss_flow_l1,
+            "loss_diffusion": loss_diffusion,
+            "loss_consistency": loss_consistency,
+            "loss_jepa": loss_jepa,
+            "loss_jepa_prior": loss_jepa_prior,
+            "loss_smooth": loss_smooth,
+        }
+    @torch.no_grad()
+    def sample(self, s2_tokens, state, state_history=None, action_history=None, steps=None, task_memory=None):
+        if state.ndim == 1:
+            state = state.unsqueeze(0)
+        if self.config.s1_policy_mode == "jepa_diffusion" and bool(self.config.enable_jepa_diffusion):
+            x = self.jepa_diffusion_sample(s2_tokens, state, state_history, action_history, steps=steps, task_memory=task_memory)
+            if self.config.action_clip > 0:
+                x = x.clamp(-self.config.action_clip, self.config.action_clip)
+            return x
+        batch_size = state.shape[0]
+        steps = steps or self.config.flow_inference_steps
+        dtype = state.dtype if state.is_floating_point() else torch.float32
+        jepa_latent = self.predict_action_latent(s2_tokens, state, action_history, task_memory) if bool(self.config.enable_jepa_diffusion) else None
+        x = torch.randn(batch_size, self.config.action_horizon, self.config.action_dim, device=state.device, dtype=dtype)
+        x = x * float(self.config.s1_sampling_noise_scale)
+        if jepa_latent is not None and float(self.config.jepa_action_prior_alpha) != 0.0:
+            x = x + float(self.config.jepa_action_prior_alpha) * self.action_prior_from_latent(jepa_latent, dtype)
+        moe_residual = self.moe_action_residual(s2_tokens, state, action_history, task_memory)
+        if moe_residual is not None:
+            x = x + float(self.config.s1_moe_residual_scale) * moe_residual
+        for i in range(steps):
+            timesteps = torch.full((batch_size,), float(i) / max(steps, 1), device=state.device, dtype=x.dtype)
+            x = x + self.velocity(s2_tokens, state, state_history, action_history, x, timesteps, jepa_latent, task_memory) / float(steps)
+        if self.config.action_clip > 0:
+            x = x.clamp(-self.config.action_clip, self.config.action_clip)
+        return x
+class Rio2JepaS1ActionExpert(nn.Module):
+    """JEPA-style S1 that preserves the online S1 policy weights.
+    This module does **not** replace the original S1 policy with an unrelated
+    world model. Instead it wraps the existing fast flow S1 as `online_s1` and
+    adds a small latent prediction side objective:
+    - online_s1: action generator; initialized and trained exactly like the
+      existing RIO-2 S1 path, so old S1 checkpoints can be remapped into it.
+    - jepa_context_encoder + predictor: predicts future action latent from
+      S2 tokens, current state, and action history.
+    - target_action_encoder: EMA target encoder for the future action chunk.
+    - latent_to_action_delta: optional zero-initialized residual head.
+    Inference defaults to the online S1 policy. JEPA affects actions only when
+    `config.use_jepa_action_residual=True` and `config.jepa_action_alpha > 0`.
+    """
+    def __init__(self, config: Rio2Config):
+        super().__init__()
+        self.config = config
+        self.online_s1 = Rio2FastS1FlowActionExpert(config)
+        hidden = int(config.jepa_hidden_dim)
+        latent = int(config.jepa_latent_dim)
+        self.s2_jepa_proj = nn.Linear(config.s2_width, hidden)
+        self.state_jepa_proj = nn.Linear(config.state_dim, hidden)
+        self.action_hist_jepa_proj = nn.Linear(config.action_dim, hidden)
+        self.type_emb = nn.Parameter(torch.randn(3, hidden) / math.sqrt(hidden))
+        layer = nn.TransformerEncoderLayer(
+            d_model=hidden,
+            nhead=max(1, int(config.jepa_heads)),
+            dim_feedforward=hidden * 4,
+            dropout=config.s1_dropout,
+            batch_first=True,
+            norm_first=True,
+            activation="gelu",
+        )
+        self.jepa_context_encoder = nn.TransformerEncoder(layer, num_layers=max(1, int(config.jepa_layers)))
+        self.jepa_norm = Rio2RMSNorm(hidden)
+        self.jepa_predictor = nn.Sequential(
+            nn.Linear(hidden, hidden),
+            nn.SiLU(),
+            nn.Linear(hidden, latent),
+        )
+        flat_action_dim = config.action_horizon * config.action_dim
+        self.action_encoder = nn.Sequential(
+            nn.Linear(flat_action_dim, hidden),
+            nn.SiLU(),
+            nn.Linear(hidden, latent),
+        )
+        self.target_action_encoder = copy.deepcopy(self.action_encoder)
+        for param in self.target_action_encoder.parameters():
+            param.requires_grad = False
+        self.latent_to_action_delta = nn.Sequential(
+            nn.Linear(latent, hidden),
+            nn.SiLU(),
+            nn.Linear(hidden, flat_action_dim),
+        )
+        nn.init.zeros_(self.latent_to_action_delta[-1].weight)
+        nn.init.zeros_(self.latent_to_action_delta[-1].bias)
+    def _prepare_action_history(self, action_history, batch_size, device, dtype):
+        if action_history is None:
+            return torch.zeros(batch_size, self.config.action_history_len, self.config.action_dim, device=device, dtype=dtype)
+        if action_history.ndim == 2:
+            action_history = action_history.unsqueeze(0)
+        action_history = action_history.to(device=device, dtype=dtype)
+        if action_history.shape[1] < self.config.action_history_len:
+            pad = torch.zeros(
+                action_history.shape[0],
+                self.config.action_history_len - action_history.shape[1],
+                action_history.shape[2],
+                device=device,
+                dtype=dtype,
+            )
+            action_history = torch.cat([pad, action_history], dim=1)
+        return action_history[:, -self.config.action_history_len :]
+    def encode_context(self, s2_tokens, state, action_history=None):
+        if state.ndim == 1:
+            state = state.unsqueeze(0)
+        if s2_tokens.ndim == 2:
+            s2_tokens = s2_tokens.unsqueeze(0)
+        batch_size = state.shape[0]
+        device = state.device
+        dtype = state.dtype if state.is_floating_point() else torch.float32
+        s2_tokens = s2_tokens.to(device=device, dtype=dtype)
+        state = state.to(device=device, dtype=dtype)
+        action_history = self._prepare_action_history(action_history, batch_size, device, dtype)
+        s2_tok = self.s2_jepa_proj(s2_tokens) + self.type_emb[0]
+        state_tok = self.state_jepa_proj(state).unsqueeze(1) + self.type_emb[1]
+        hist_tok = self.action_hist_jepa_proj(action_history) + self.type_emb[2]
+        tokens = torch.cat([s2_tok, state_tok, hist_tok], dim=1)
+        hidden = self.jepa_context_encoder(tokens)
+        hidden = self.jepa_norm(hidden)
+        return hidden.mean(dim=1)
+    def predict_action_latent(self, s2_tokens, state, action_history=None):
+        context = self.encode_context(s2_tokens, state, action_history)
+        return self.jepa_predictor(context)
+    def encode_action_latent(self, actions: torch.Tensor, target: bool = False) -> torch.Tensor:
+        if actions.ndim == 2:
+            actions = actions.unsqueeze(0)
+        flat = actions.reshape(actions.shape[0], -1)
+        encoder = self.target_action_encoder if target else self.action_encoder
+        latent = encoder(flat)
+        return F.normalize(latent.float(), dim=-1).to(dtype=flat.dtype)
+    @torch.no_grad()
+    def update_target_encoder(self, decay: float | None = None):
+        decay = float(self.config.jepa_ema_decay if decay is None else decay)
+        for online, target in zip(self.action_encoder.parameters(), self.target_action_encoder.parameters()):
+            target.data.mul_(decay).add_(online.data, alpha=1.0 - decay)
+    def freeze_target_encoder(self):
+        if hasattr(self.online_s1, "freeze_target_encoder"):
+            self.online_s1.freeze_target_encoder()
+        for param in self.target_action_encoder.parameters():
+            param.requires_grad = False
+    def training_loss(self, s2_tokens, state, state_history, action_history, target_actions, task_memory=None):
+        base_losses = self.online_s1.training_loss(s2_tokens, state, state_history, action_history, target_actions, task_memory=task_memory)
+        pred_latent = F.normalize(self.predict_action_latent(s2_tokens, state, action_history).float(), dim=-1)
+        with torch.no_grad():
+            target_latent = self.encode_action_latent(target_actions, target=True).float()
+        loss_jepa = F.mse_loss(pred_latent, target_latent)
+        loss = base_losses["loss"] + float(self.config.jepa_loss_weight) * loss_jepa.to(base_losses["loss"].dtype)
+        return {
+            **base_losses,
+            "loss": loss,
+            "loss_jepa": loss_jepa,
+            "pred_action_latent": pred_latent,
+            "target_action_latent": target_latent,
+        }
+    @torch.no_grad()
+    def sample(self, s2_tokens, state, state_history=None, action_history=None, steps=None, task_memory=None):
+        actions = self.online_s1.sample(s2_tokens, state, state_history, action_history, steps=steps, task_memory=task_memory)
+        if bool(self.config.use_jepa_action_residual) and float(self.config.jepa_action_alpha) != 0.0:
+            pred_latent = self.predict_action_latent(s2_tokens, state, action_history).to(actions.dtype)
+            delta = self.latent_to_action_delta(pred_latent).view(
+                actions.shape[0], self.config.action_horizon, self.config.action_dim
+            )
+            actions = actions + float(self.config.jepa_action_alpha) * delta
+            if self.config.action_clip > 0:
+                actions = actions.clamp(-self.config.action_clip, self.config.action_clip)
+        return actions
+class Rio2ResidualAdapter(nn.Module):
+    """Tiny correction head. Initial output is zero when residual_alpha=0."""
+    def __init__(self, config: Rio2Config):
+        super().__init__()
+        width = min(256, max(64, config.s1_width))
+        self.net = nn.Sequential(
+            nn.Linear(config.state_dim, width),
+            nn.SiLU(),
+            nn.Linear(width, config.action_horizon * config.action_dim),
+        )
+        self.config = config
+        nn.init.zeros_(self.net[-1].weight)
+        nn.init.zeros_(self.net[-1].bias)
+    def forward(self, state: torch.Tensor) -> torch.Tensor:
+        if state.ndim == 1:
+            state = state.unsqueeze(0)
+        delta = self.net(state).view(state.shape[0], self.config.action_horizon, self.config.action_dim)
+        return delta
+class Rio2PreTrainedModel(PreTrainedModel):
+    config_class = Rio2Config
+    base_model_prefix = "rio2"
+    supports_gradient_checkpointing = False
+    _no_split_modules = ["Rio2FastS1FlowActionExpert", "Rio2MolmoAct2Core"]
+    def _init_weights(self, module):
+        std = 0.02
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+        elif isinstance(module, Rio2RMSNorm):
+            module.weight.data.fill_(1.0)
+class Rio2Model(Rio2PreTrainedModel):
+    """RIO-2 weight-preserved SO101 policy integrated as a Transformers model."""
+    def __init__(self, config: Rio2Config):
+        super().__init__(config)
+        self.molmoact = Rio2MolmoAct2Core(config)
+        if bool(config.enable_jepa_s1):
+            self.s1_student = Rio2JepaS1ActionExpert(config)
+        else:
+            self.s1_student = Rio2FastS1FlowActionExpert(config)
+        self.residual_adapter = Rio2ResidualAdapter(config) if config.enable_residual_adapter else None
+        self._s2_cache: torch.Tensor | None = None
+        self._s2_cache_time: float = 0.0
+        self._cached_instruction: str | None = None
+        self._action_chunk_history: list[tuple[torch.Tensor, int]] = []
+        self._task_memory_cache: torch.Tensor | None = None
+        self.post_init()
+        if config.load_base_on_init:
+            logger.warning("config.load_base_on_init=True loads MolmoAct2 during construction; prefer load_s2_base().")
+            self.load_s2_base()
+        self.apply_finetuning_policy()
+    @property
+    def s2(self):
+        """Backward-compatible alias without duplicate module registration.
+        Returns ``self.molmoact``.
+        """
+        return self.molmoact
+    @property
+    def s1(self):
+        """Backward-compatible alias without duplicate module registration.
+        Returns ``self.s1_student``.
+        """
+        return self.s1_student
+    def load_s2_base(self, device: str | torch.device | None = None, device_map: str | None = None):
+        """Load the base model through ``self.molmoact`` and re-apply the finetuning policy.
+        ``device`` and ``device_map`` are forwarded unchanged to
+        ``molmoact.load_base``. Returns ``self`` so calls can be chained.
+        """
+        self.molmoact.load_base(device=device, device_map=device_map)
+        # Re-applied after loading, since __init__ applied it before the base was loaded.
+        self.apply_finetuning_policy()
+        return self
+    def freeze_s2_base(self):
+        """Delegate to ``self.molmoact.freeze_base()`` and return ``self`` for chaining."""
+        self.molmoact.freeze_base()
+        return self
+    @torch.no_grad()
+    def reset_temporal_ensemble(self):
+        """Drop every stored chunk from ``_action_chunk_history`` and return ``self``."""
+        self._action_chunk_history.clear()
+        return self
+    @torch.no_grad()
+    def reset_task_memory(self):
+        """Clear the cached task memory and return ``self``."""
+        self._task_memory_cache = None
+        return self
+    @torch.no_grad()
+    def update_task_memory(self, s2_tokens: torch.Tensor, reset: bool = False):
+        if not bool(self.config.task_memory_enabled):
+            self._task_memory_cache = None
+            return None
+        device = next(self.s1_student.parameters()).device
+        dtype = next(self.s1_student.parameters()).dtype
+        if hasattr(self.s1_student, "default_task_memory_from_s2"):
+            candidate = self.s1_student.default_task_memory_from_s2(s2_tokens.to(device=device, dtype=dtype)).detach()
+        elif hasattr(self.s1_student, "online_s1"):
+            candidate = self.s1_student.online_s1.default_task_memory_from_s2(s2_tokens.to(device=device, dtype=dtype)).detach()
+        else:
+            return None
+        if (
+            reset
+            or self._task_memory_cache is None
+            or tuple(self._task_memory_cache.shape) != tuple(candidate.shape)
+        ):
+            memory = candidate
+        else:
+            memory = float(self.config.task_memory_ema) * self._task_memory_cache.to(device=device, dtype=dtype)
+            memory = memory + (1.0 - float(self.config.task_memory_ema)) * candidate
+        max_norm = float(self.config.task_memory_max_norm)
+        if max_norm > 0:
+            norms = memory.norm(dim=-1, keepdim=True).clamp_min(1e-6)
+            memory = memory * (max_norm / norms).clamp(max=1.0)
+        self._task_memory_cache = memory.detach()
+        return self._task_memory_cache
+    @torch.no_grad()
+    def _apply_temporal_ensemble(self, actions: torch.Tensor, enabled: bool | None = None) -> torch.Tensor:
+        use_ensemble = self.config.temporal_ensemble_enabled if enabled is None else enabled
+        if not use_ensemble or actions.ndim != 3:
+            return actions
+        if self._action_chunk_history and self._action_chunk_history[0][0].shape != actions.shape:
+            self.reset_temporal_ensemble()
+        aged = []
+        for chunk, age in self._action_chunk_history:
+            next_age = age + 1
+            if next_age < actions.shape[1]:
+                aged.append((chunk, next_age))
+        max_chunks = int(max(1, self.config.temporal_ensemble_max_chunks))
+        self._action_chunk_history = [(actions.detach(), 0)] + aged[: max_chunks - 1]
+        blended = []
+        for offset in range(actions.shape[1]):
+            weighted_sum = None
+            weight_sum = 0.0
+            for chunk, age in self._action_chunk_history:
+                idx = age + offset
+                if idx >= actions.shape[1]:
+                    continue
+                weight = math.exp(-float(self.config.temporal_ensemble_decay) * age)
+                value = chunk[:, idx]
+                weighted_sum = value * weight if weighted_sum is None else weighted_sum + value * weight
+                weight_sum += weight
+            blended.append(weighted_sum / max(weight_sum, 1e-8))
+        return torch.stack(blended, dim=1)
+    def apply_finetuning_policy(self):
+        """Apply the default small-tuning policy.
+        Base MolmoAct2 weights are frozen by default. Trainable parameters are
+        compressor/student/residual-adapter parameters, and optionally the
+        detected original action expert when the user explicitly unfreezes it.
+        """
+        if self.config.train_adapters_only:
+            if self.molmoact.base is not None:
+                self.molmoact.freeze_base()
+            for param in self.molmoact.compressor.parameters():
+                param.requires_grad = True
+            for param in self.s1_student.parameters():
+                param.requires_grad = True
+            if hasattr(self.s1_student, "freeze_target_encoder"):
+                self.s1_student.freeze_target_encoder()
+            if self.residual_adapter is not None:
+                for param in self.residual_adapter.parameters():
+                    param.requires_grad = bool(self.config.residual_trainable)
+        return self
+    def unfreeze_original_s1(self):
+        """Delegate to ``self.molmoact.unfreeze_action_expert()`` and return its result."""
+        return self.molmoact.unfreeze_action_expert()
+    def trainable_parameter_names(self) -> list[str]:
+        """Return the names of all parameters whose ``requires_grad`` is true."""
+        return [name for name, param in self.named_parameters() if param.requires_grad]
+    @torch.no_grad()
+    def update_jepa_target_encoder(self, decay: float | None = None):
+        """Forward ``decay`` to the student's ``update_target_encoder`` when it has one.
+        A student without that method makes this a no-op. Returns ``self``.
+        """
+        if hasattr(self.s1_student, "update_target_encoder"):
+            self.s1_student.update_target_encoder(decay=decay)
+        return self
+    @torch.no_grad()
+    def refresh_s2(self, images: ImageLike | list[ImageLike], instruction: str, force: bool = False) -> torch.Tensor:
+        tokens = self.molmoact.refresh_s2(images, instruction, force=force)
+        if instruction != self._cached_instruction or force:
+            self.reset_temporal_ensemble()
+            self.update_task_memory(tokens, reset=instruction != self._cached_instruction)
+        else:
+            self.update_task_memory(tokens, reset=False)
+        self._s2_cache = tokens.detach()
+        self._s2_cache_time = time.time()
+        self._cached_instruction = instruction
+        return self._s2_cache
+    @torch.no_grad()
+    def act_fast(
+        self,
+        state: torch.Tensor,
+        state_history: torch.Tensor | None = None,
+        action_history: torch.Tensor | None = None,
+        steps: int | None = None,
+        use_original: bool | None = None,
+        temporal_ensemble: bool | None = None,
+    ) -> torch.Tensor:
+        use_original = self.config.use_original_s1 if use_original is None else use_original
+        device = next(self.parameters()).device
+        state = state.to(device)
+        state_history = None if state_history is None else state_history.to(device)
+        action_history = None if action_history is None else action_history.to(device)
+        if use_original and self.molmoact.base is not None:
+            actions = self.molmoact.act_original(state, state_history, action_history, num_steps=steps)
+        else:
+            if self._s2_cache is None:
+                raise RuntimeError("S2 cache is empty. Call refresh_s2() or pass s2_tokens to forward().")
+            s2_tokens = self._s2_cache.to(device=device, dtype=state.dtype if state.is_floating_point() else torch.float32)
+            task_memory = None if self._task_memory_cache is None else self._task_memory_cache.to(device=device, dtype=s2_tokens.dtype)
+            actions = self.s1_student.sample(s2_tokens, state, state_history, action_history, steps=steps, task_memory=task_memory)
+        if self.residual_adapter is not None and float(self.config.residual_alpha) != 0.0:
+            actions = actions + float(self.config.residual_alpha) * self.residual_adapter(state).to(actions.dtype)
+        if self.config.action_clip > 0:
+            actions = actions.clamp(-self.config.action_clip, self.config.action_clip)
+        return self._apply_temporal_ensemble(actions, enabled=temporal_ensemble)
+    def forward_from_s2_tokens(
+        self,
+        s2_tokens: torch.Tensor,
+        state: torch.Tensor,
+        state_history: torch.Tensor | None = None,
+        action_history: torch.Tensor | None = None,
+        target_actions: torch.Tensor | None = None,
+        s1_steps: int | None = None,
+        task_memory: torch.Tensor | None = None,
+        return_dict: bool | None = None,
+    ) -> tuple[torch.Tensor] | Rio2Output:
+        """Call ``forward`` with explicit S2 tokens and the student path forced.
+        All arguments are passed through by keyword; ``use_original`` is pinned
+        to ``False`` and no images/instruction are supplied, so ``forward`` never
+        refreshes S2 from this entry point. Returns whatever ``forward`` returns.
+        """
+        return self.forward(
+            state=state,
+            s2_tokens=s2_tokens,
+            state_history=state_history,
+            action_history=action_history,
+            target_actions=target_actions,
+            s1_steps=s1_steps,
+            task_memory=task_memory,
+            return_dict=return_dict,
+            use_original=False,
+        )
+    def forward(
+        self,
+        state: torch.Tensor,
+        s2_tokens: torch.Tensor | None = None,
+        state_history: torch.Tensor | None = None,
+        action_history: torch.Tensor | None = None,
+        target_actions: torch.Tensor | None = None,
+        images: ImageLike | list[ImageLike] | None = None,
+        instruction: str | None = None,
+        refresh_s2: bool = False,
+        s1_steps: int | None = None,
+        task_memory: torch.Tensor | None = None,
+        use_original: bool | None = None,
+        return_dict: bool | None = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor] | Rio2Output:
+        """Compute the training loss or run action inference.
+        S2 tokens are resolved in this order: a forced ``refresh_s2`` when
+        ``refresh_s2=True``, the ``s2_tokens`` argument, then ``self._s2_cache``.
+        Training (``target_actions`` given): the student's ``training_loss`` is
+        evaluated and packed into a ``Rio2Output``; with ``return_dict`` false the
+        non-``None`` fields of that output are returned as a tuple.
+        Inference: when ``use_original`` is true and a base model is loaded,
+        actions come from ``act_fast(..., use_original=True)``; otherwise the
+        student samples from the resolved tokens (falling back to the cached task
+        memory when ``task_memory`` is ``None``) and the residual adapter is added
+        when present and ``config.residual_alpha`` is non-zero. With
+        ``return_dict`` false the result is ``(actions, tokens)``.
+        ``return_dict`` defaults to ``config.use_return_dict`` and ``use_original``
+        to ``config.use_original_s1``. Extra ``**kwargs`` are accepted but not
+        read in this method.
+        Raises:
+            ValueError: ``refresh_s2=True`` without both ``images`` and
+                ``instruction``, or no S2 tokens are available for the training
+                or student-inference path.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        use_original = self.config.use_original_s1 if use_original is None else use_original
+        if refresh_s2:
+            if images is None or instruction is None:
+                raise ValueError("`images` and `instruction` are required when refresh_s2=True.")
+            s2_tokens = self.refresh_s2(images, instruction, force=True)
+        elif s2_tokens is None:
+            # May still be None here; each path below checks before use.
+            s2_tokens = self._s2_cache
+        device = next(self.parameters()).device
+        state = state.to(device)
+        state_history = None if state_history is None else state_history.to(device)
+        action_history = None if action_history is None else action_history.to(device)
+        # Training path: use cached-token/student path by default because the
+        # original MolmoAct2 action expert is usually frozen and remote-code
+        # signatures may not expose target-action training directly.
+        if target_actions is not None:
+            if s2_tokens is None:
+                raise ValueError("Training requires `s2_tokens` or refresh_s2=True.")
+            # Tokens and targets follow the state's float dtype (float32 for non-float states).
+            s2_tokens = s2_tokens.to(device=device, dtype=state.dtype if state.is_floating_point() else torch.float32)
+            task_memory = None if task_memory is None else task_memory.to(device=device, dtype=s2_tokens.dtype)
+            target_actions = target_actions.to(device=device, dtype=state.dtype if state.is_floating_point() else torch.float32)
+            losses = self.s1_student.training_loss(s2_tokens, state, state_history, action_history, target_actions, task_memory=task_memory)
+            # Keys read with [] are required from training_loss; .get() keys are optional.
+            output = Rio2Output(
+                loss=losses["loss"],
+                s2_tokens=s2_tokens,
+                loss_flow_mse=losses["loss_flow_mse"],
+                loss_flow_l1=losses["loss_flow_l1"],
+                loss_diffusion=losses.get("loss_diffusion"),
+                loss_consistency=losses.get("loss_consistency"),
+                loss_smooth=losses["loss_smooth"],
+                loss_jepa=losses.get("loss_jepa"),
+                loss_jepa_prior=losses.get("loss_jepa_prior"),
+                pred_action_latent=losses.get("pred_action_latent"),
+                target_action_latent=losses.get("target_action_latent"),
+                runtime_path="jepa_s1_training" if "loss_jepa" in losses else "student_adapter_training",
+            )
+            return tuple(v for v in output.to_tuple() if v is not None) if not return_dict else output
+        if use_original and self.molmoact.base is not None:
+            # act_fast also applies the residual adapter, clipping and temporal ensemble.
+            actions = self.act_fast(state, state_history, action_history, steps=s1_steps, use_original=True)
+            runtime_path = self.molmoact.last_runtime_path
+            # Reports the cached tokens (possibly None), not the `s2_tokens` argument.
+            tokens = self._s2_cache
+        else:
+            if s2_tokens is None:
+                raise ValueError("Pass `s2_tokens`, call refresh_s2(), or set refresh_s2=True.")
+            s2_tokens = s2_tokens.to(device=device, dtype=state.dtype if state.is_floating_point() else torch.float32)
+            if task_memory is None and self._task_memory_cache is not None:
+                task_memory = self._task_memory_cache
+            task_memory = None if task_memory is None else task_memory.to(device=device, dtype=s2_tokens.dtype)
+            actions = self.s1_student.sample(s2_tokens, state, state_history, action_history, steps=s1_steps, task_memory=task_memory)
+            if self.residual_adapter is not None and float(self.config.residual_alpha) != 0.0:
+                actions = actions + float(self.config.residual_alpha) * self.residual_adapter(state).to(actions.dtype)
+            # NOTE(review): unlike act_fast, this branch applies neither action_clip
+            # nor the temporal ensemble; confirm that is intended.
+            runtime_path = "student_cached_tokens"
+            tokens = s2_tokens
+        output = Rio2Output(actions=actions, s2_tokens=tokens, runtime_path=runtime_path)
+        return (actions, tokens) if not return_dict else output
+# Names exported by `from modeling_rio2 import *`.
+__all__ = [
+    "Rio2Model",
+    "Rio2PreTrainedModel",
+]

processing_rio2.py ADDED Viewed

	@@ -0,0 +1,128 @@

+# Copyright 2026 The HuggingFace Inc. team and the Rio2 contributors.
+# Licensed under the Apache License, Version 2.0.
+"""Processor for Rio2."""
+from __future__ import annotations
+import json
+from pathlib import Path
+from typing import Any
+from transformers.processing_utils import ProcessorMixin
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class Rio2Processor(ProcessorMixin):
+    attributes = []
+    optional_attributes = []
+    def __init__(self, base_processor=None, base_model_id: str | None = None, **kwargs):
+        self.base_processor = base_processor
+        self.base_model_id = base_model_id
+        self.chat_template = kwargs.pop("chat_template", None)
+    @classmethod
+    def from_base_model_id(cls, base_model_id: str, **kwargs):
+        from transformers import AutoProcessor
+        base_processor = AutoProcessor.from_pretrained(base_model_id, trust_remote_code=True, **kwargs)
+        return cls(base_processor=base_processor, base_model_id=base_model_id)
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        path = Path(pretrained_model_name_or_path)
+        base_model_id = kwargs.pop("base_model_id", None)
+        load_base_processor = bool(kwargs.pop("load_base_processor", False))
+        hub_kwargs = {
+            key: kwargs.get(key)
+            for key in ["cache_dir", "force_download", "proxies", "token", "revision", "local_files_only", "subfolder"]
+            if key in kwargs
+        }
+        if path.exists():
+            cfg_path = path / "processor_config.json"
+            model_cfg_path = path / "config.json"
+            if cfg_path.exists():
+                data = json.loads(cfg_path.read_text(encoding="utf-8"))
+                base_model_id = base_model_id or data.get("base_model_id")
+            if base_model_id is None and model_cfg_path.exists():
+                data = json.loads(model_cfg_path.read_text(encoding="utf-8"))
+                base_model_id = data.get("base_model_id")
+        else:
+            try:
+                from transformers.utils import cached_file
+                cfg_file = cached_file(pretrained_model_name_or_path, "processor_config.json", **hub_kwargs)
+                if cfg_file:
+                    data = json.loads(Path(cfg_file).read_text(encoding="utf-8"))
+                    base_model_id = base_model_id or data.get("base_model_id")
+            except Exception as exc:
+                logger.debug("Could not load RIO-2 processor config from Hub: %s", exc)
+            if base_model_id is None:
+                try:
+                    from transformers.utils import cached_file
+                    cfg_file = cached_file(pretrained_model_name_or_path, "config.json", **hub_kwargs)
+                    if cfg_file:
+                        data = json.loads(Path(cfg_file).read_text(encoding="utf-8"))
+                        base_model_id = data.get("base_model_id")
+                except Exception as exc:
+                    logger.debug("Could not load RIO-2 model config from Hub: %s", exc)
+        base_processor = None
+        if base_model_id and load_base_processor:
+            try:
+                from transformers import AutoProcessor
+                trust_remote_code = kwargs.pop("trust_remote_code", True)
+                base_processor = AutoProcessor.from_pretrained(base_model_id, trust_remote_code=trust_remote_code, **kwargs)
+            except Exception as exc:
+                logger.warning("Could not load base processor %s: %s", base_model_id, exc)
+        return cls(base_processor=base_processor, base_model_id=base_model_id)
+    def save_pretrained(self, save_directory, **kwargs):
+        out = Path(save_directory)
+        out.mkdir(parents=True, exist_ok=True)
+        data = {
+            "processor_class": self.__class__.__name__,
+            "base_model_id": self.base_model_id,
+            "auto_map": {"AutoProcessor": "processing_rio2.Rio2Processor"},
+        }
+        (out / "processor_config.json").write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8")
+        if self.base_processor is not None and kwargs.pop("save_base_processor", False):
+            base_dir = out / "base_processor"
+            self.base_processor.save_pretrained(base_dir)
+        return [str(out / "processor_config.json")]
+    def __call__(
+        self,
+        images=None,
+        instruction: str | None = None,
+        state: Any | None = None,
+        state_history: Any | None = None,
+        action_history: Any | None = None,
+        target_actions: Any | None = None,
+        **kwargs,
+    ) -> dict[str, Any]:
+        out: dict[str, Any] = {}
+        if self.base_processor is not None and images is not None and instruction is not None:
+            out.update(self.base_processor(images=images, text=instruction, return_tensors="pt", **kwargs))
+        else:
+            if images is not None:
+                out["images"] = images
+            if instruction is not None:
+                out["instruction"] = instruction
+        if state is not None:
+            out["state"] = state
+        if state_history is not None:
+            out["state_history"] = state_history
+        if action_history is not None:
+            out["action_history"] = action_history
+        if target_actions is not None:
+            out["target_actions"] = target_actions
+        return out
+# Names exported by `from processing_rio2 import *`.
+__all__ = ["Rio2Processor"]

processor_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "processor_class": "Rio2Processor",
+  "base_model_id": "allenai/MolmoAct2-SO100_101",
+  "auto_map": {
+    "AutoProcessor": "processing_rio2.Rio2Processor"
+  }
+}

rio2_export_manifest.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "repo_mode": "custom_code",
+  "single_weight_file": "model.safetensors",
+  "config_file": "config.json",
+  "custom_code_files": [
+    "configuration_rio2.py",
+    "modeling_rio2.py",
+    "processing_rio2.py"
+  ],
+  "repo_id": "hoguai/RIO-2"
+}

runtime_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "base_model_id": "__merged_in_model_safetensors__",
+  "local_base": true,
+  "single_weight_file": "model.safetensors",
+  "s1_expanded": true,
+  "s1_moe_finetuned": true,
+  "requires_finetune_after_expansion": false
+}