File size: 37,261 Bytes

e94400c

#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
#  This file is a StarVLA-local variant of the action encoder/decoder.
#  It keeps the overall structure but replaces the decoder with a
#  flow-matching based decoder (velocity prediction) and injects timestep
#  conditioning into RMSNorm (AdaRMSNorm) in the decoder.
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨

import sys
sys.path.append("/mnt/data/fangyu/code/reward_new")

import math
from typing import List

import numpy as np
import torch
import torch.distributed as dist
import torch.nn.functional as F
from torch import nn
from torch.distributions import Beta

from transformers.processing_utils import Unpack
from transformers.utils import TransformersKwargs, logging

from starVLA.model.modules.action_model.ActionModel import (
    Qwen3Attention,
    Qwen3MLP,
    Qwen3RMSNorm,
    Qwen3RotaryEmbedding,
    ActionPreTrainedModel,
)
from starVLA.model.modules.action_model.configuration_actionmodel import ActionModelConfig
from starVLA.model.tools import FRAMEWORK_REGISTRY

logger = logging.get_logger(__name__)


class _GradientReversalFunction(torch.autograd.Function):
    """
    Forward: identity. Backward: scale gradient by -lambda (inverse gradient).
    Used for domain adversarial training so the encoder receives reversed gradient
    and is encouraged to produce domain-invariant embeddings.
    """

    @staticmethod
    def forward(ctx, x: torch.Tensor, lambda_: float) -> torch.Tensor:
        ctx.lambda_ = lambda_
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output: torch.Tensor):
        return -ctx.lambda_ * grad_output, None


def _timestep_embedding(t: torch.Tensor, dim: int, max_period: float = 10000.0) -> torch.Tensor:
    """
    Standard sinusoidal timestep embedding.
    Args:
        t: (B,) float tensor, typically in [0, 1].
    Returns:
        (B, dim)
    """
    if t.ndim != 1:
        raise ValueError(f"Expected `t` to have shape (B,), got {tuple(t.shape)}")
    half = dim // 2
    freqs = torch.exp(
        -math.log(max_period) * torch.arange(0, half, device=t.device, dtype=torch.float32) / max(half, 1)
    )
    args = t.to(torch.float32)[:, None] * freqs[None]
    emb = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
    if dim % 2 == 1:
        emb = torch.cat([emb, torch.zeros((emb.shape[0], 1), device=t.device, dtype=emb.dtype)], dim=-1)
    return emb.to(dtype=t.dtype)


class Qwen3AdaRMSNorm(nn.Module):
    """
    RMSNorm + timestep conditioning.

    y = RMSNorm(x) * (1 + scale(t)) + shift(t)
    """

    def __init__(self, hidden_size: int, cond_size: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps
        self.cond_mlp = nn.Sequential(
            nn.SiLU(),
            nn.Linear(cond_size, 2 * hidden_size, bias=True),
        )

    def forward(self, hidden_states: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
        if cond is None:
            raise ValueError("Qwen3AdaRMSNorm requires `cond` but got None.")
        if cond.ndim != 2:
            raise ValueError(f"Expected `cond` to have shape (B, C), got {tuple(cond.shape)}")

        input_dtype = hidden_states.dtype
        x = hidden_states.to(torch.float32)
        variance = x.pow(2).mean(-1, keepdim=True)
        x = x * torch.rsqrt(variance + self.variance_epsilon)
        x = self.weight * x.to(input_dtype)

        scale, shift = self.cond_mlp(cond).chunk(2, dim=-1)
        return x * (1 + scale[:, None, :]) + shift[:, None, :]

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


class Qwen3LayerFM(nn.Module):
    """
    Same block structure as `Qwen3Layer`, but decoder-side RMSNorms are timestep-conditioned.
    Attention/MLP are unchanged.
    """

    def __init__(self, config: ActionModelConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = Qwen3Attention(config=config, layer_idx=layer_idx)
        self.mlp = Qwen3MLP(config)
        self.input_layernorm = Qwen3AdaRMSNorm(config.hidden_size, cond_size=config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Qwen3AdaRMSNorm(
            config.hidden_size, cond_size=config.hidden_size, eps=config.rms_norm_eps
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        temb: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
        **kwargs,
    ) -> torch.Tensor:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states, temb)
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states, temb)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
        return hidden_states


class ActionModelFM(ActionPreTrainedModel):
    """
    Flow-matching based decoder variant for StarVLA `ActionModel`.
    Encoder stays the same; decoder predicts velocity under linear interpolation noise.
    """

    def __init__(self, config: ActionModelConfig):
        super().__init__(config)
        self.config = config

        # ===== tokens / embeddings (same as original) =====
        self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
        self.action_mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.state_mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))

        self.dataset_embed = nn.Embedding(
            config.dataset_vocab_size,
            config.hidden_size * config.num_data_tokens,
        )

        self.action_proj_in = nn.Linear(config.action_size, config.hidden_size)
        self.state_proj_in = nn.Linear(config.state_size, config.hidden_size)
        self.use_state = config.use_state
        print(f"use_state: {self.use_state}")

        # ===== encoder (unchanged blocks) =====
        # Reuse the original Qwen3Layer implementation from ActionModel.py through `ActionPreTrainedModel` machinery
        from starVLA.model.modules.action_model.ActionModel import Qwen3Layer  # local import
        self.action_encoder = nn.ModuleList([Qwen3Layer(config, layer_idx) for layer_idx in range(config.num_encoder_layers)])

        # ===== decoder (FM) =====
        self.action_decoder = nn.ModuleList([Qwen3LayerFM(config, layer_idx) for layer_idx in range(config.num_decoder_layers)])
        self.norm = Qwen3AdaRMSNorm(config.hidden_size, cond_size=config.hidden_size, eps=config.rms_norm_eps)
        self.action_proj_out = nn.Linear(config.hidden_size, config.action_size)

        self.rotary_emb = Qwen3RotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # ===== FM hyperparams =====
        self.fm_time_min = float(getattr(config, "fm_time_min", 0.001))
        self.fm_time_max = float(getattr(config, "fm_time_max", 0.999))
        self.fm_num_inference_steps = int(getattr(config, "fm_num_inference_steps", 10))
        self.fm_time_sampling = str(getattr(config, "fm_time_sampling", "uniform"))  # "uniform" | "beta"
        self.fm_beta_alpha = float(getattr(config, "fm_beta_alpha", 1.5))
        self.fm_beta_beta = float(getattr(config, "fm_beta_beta", 1.0))
        self._beta_dist = Beta(self.fm_beta_alpha, self.fm_beta_beta)

        # timestep -> temb (B,H)
        self.fm_timestep_mlp = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size * 4, bias=True),
            nn.SiLU(),
            nn.Linear(config.hidden_size * 4, config.hidden_size, bias=True),
        )

        # ===== Loss mode: masked-action recon =====
        self.use_masked_action_recon = bool(getattr(config, "use_masked_action_recon", False))
        self.post_init()

        self._maybe_init_from_qwen3()


    def _sample_fm_time(self, batch_size: int, device: torch.device, dtype: torch.dtype) -> torch.Tensor:
        if self.fm_time_sampling == "beta":
            t = self._beta_dist.sample([batch_size]).to(device=device, dtype=dtype)
        else:
            t = torch.rand((batch_size,), device=device, dtype=dtype)
        t = t * (self.fm_time_max - self.fm_time_min) + self.fm_time_min
        return t

    def _fm_temb(self, t: torch.Tensor) -> torch.Tensor:
        return self.fm_timestep_mlp(_timestep_embedding(t, self.config.hidden_size))

    def _gather_embeddings(self, x: torch.Tensor) -> tuple[torch.Tensor, int]:
        """
        Gather embeddings from all ranks.
        Returns (gathered_tensor, offset) where offset is the start index of this rank's data in the global batch.
        Single-GPU: returns (x, 0).
        """
        if not (self.contrastive_use_distributed and dist.is_initialized() and dist.get_world_size() > 1):
            return x, 0
        world_size = dist.get_world_size()
        local_size = x.shape[0]
        size_list = [torch.tensor([0], dtype=torch.long, device=x.device) for _ in range(world_size)]
        dist.all_gather(size_list, torch.tensor([local_size], dtype=torch.long, device=x.device))
        sizes = [s.item() for s in size_list]
        max_size = max(sizes)
        offset = sum(sizes[: dist.get_rank()])
        if max_size > local_size:
            padding = torch.zeros(max_size - local_size, x.shape[1], device=x.device, dtype=x.dtype)
            x = torch.cat([x, padding], dim=0)
        gather_list = [torch.zeros_like(x) for _ in range(world_size)]
        dist.all_gather(gather_list, x)
        out = torch.cat([g[: sizes[i]] for i, g in enumerate(gather_list)], dim=0)
        return out, offset

    def random_masking(self, x: torch.Tensor, mask_ratio: float | torch.Tensor):
        """
        MAE-style per-sample random masking by shuffling (argsort noise).

        This version DOES NOT drop tokens; it returns `x_masked` with the same shape as `x`,
        where masked positions are replaced by `self.action_mask_token`.

        Args:
            x: [N, L, D]
            mask_ratio: float in [0, 1) OR tensor of shape [N] with per-sample ratios

        Returns:
            x_masked: [N, L, D]
            mask: [N, L] (0=keep, 1=mask)
            ids_restore: [N, L]
        """
        N, L, D = x.shape
        token_dim = int(self.action_mask_token.shape[-1])
        if D != token_dim:
            raise ValueError(
                f"`random_masking` expects last dim D=={token_dim} (same as action_mask_token), got D=={D}."
            )
        if isinstance(mask_ratio, torch.Tensor):
            if mask_ratio.ndim != 1 or mask_ratio.shape[0] != N:
                raise ValueError(
                    f"When `mask_ratio` is a tensor it must have shape (N,), got {tuple(mask_ratio.shape)}"
                )
            # clamp to safe range
            mask_ratio = mask_ratio.to(device=x.device, dtype=torch.float32).clamp(min=0.0, max=0.999)
            len_keep = torch.floor(L * (1.0 - mask_ratio)).to(dtype=torch.long)  # (N,)
        else:
            mr = float(mask_ratio)
            mr = max(0.0, min(0.999, mr))
            len_keep = int(L * (1.0 - mr))

        noise = torch.rand(N, L, device=x.device)  # noise in [0, 1]
        ids_shuffle = torch.argsort(noise, dim=1)  # ascend: small is keep, large is mask
        ids_restore = torch.argsort(ids_shuffle, dim=1)

        # generate the binary mask: 0 is keep, 1 is mask
        mask = torch.ones([N, L], device=x.device, dtype=torch.float32)
        if isinstance(len_keep, torch.Tensor):
            # build mask in shuffled order then unshuffle
            keep = torch.arange(L, device=x.device)[None, :].expand(N, L) < len_keep[:, None]  # (N,L)
            mask = (~keep).to(dtype=torch.float32)
        else:
            mask[:, :len_keep] = 0
        mask = torch.gather(mask, dim=1, index=ids_restore)  # unshuffle

        # replace masked tokens with action_mask_token (keep sequence length)
        mask_token = self.action_mask_token.expand(N, L, -1).to(dtype=x.dtype, device=x.device)
        x_masked = x * (1.0 - mask[:, :, None]) + mask[:, :, None] * mask_token

        return x_masked, mask, ids_restore

    def random_masking_interleaved(
        self,
        interleaved: torch.Tensor,
        mask_ratio: float | torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        MAE-style random masking for interleaved [state_0, action_0, state_1, action_1, ...].
        Positions 0, 2, 4, ... are state (replaced with state_mask_token when masked);
        positions 1, 3, 5, ... are action (replaced with action_mask_token when masked).

        Args:
            interleaved: [N, 2*L, D] (state, action, state, action, ...)
            mask_ratio: float in [0, 1) OR tensor [N] per-sample

        Returns:
            x_masked: [N, 2*L, D]
            mask: [N, 2*L] (0=keep, 1=mask)
            ids_restore: [N, 2*L]
        """
        N, two_L, D = interleaved.shape
        L = two_L // 2
        if isinstance(mask_ratio, torch.Tensor):
            mask_ratio = mask_ratio.to(device=interleaved.device, dtype=torch.float32).clamp(min=0.0, max=0.999)
            len_keep = torch.floor(two_L * (1.0 - mask_ratio)).to(dtype=torch.long)
        else:
            mr = max(0.0, min(0.999, float(mask_ratio)))
            len_keep = int(two_L * (1.0 - mr))

        noise = torch.rand(N, two_L, device=interleaved.device)
        ids_shuffle = torch.argsort(noise, dim=1)
        ids_restore = torch.argsort(ids_shuffle, dim=1)

        if isinstance(len_keep, torch.Tensor):
            keep = torch.arange(two_L, device=interleaved.device)[None, :].expand(N, two_L) < len_keep[:, None]
            mask = (~keep).to(dtype=torch.float32)
        else:
            mask = torch.ones(N, two_L, device=interleaved.device, dtype=torch.float32)
            mask[:, :len_keep] = 0
        mask = torch.gather(mask, dim=1, index=ids_restore)

        state_mtk = self.state_mask_token.expand(N, two_L, -1).to(dtype=interleaved.dtype, device=interleaved.device)
        action_mtk = self.action_mask_token.expand(N, two_L, -1).to(dtype=interleaved.dtype, device=interleaved.device)
        # even indices -> state, odd -> action
        state_pos = torch.zeros(two_L, device=interleaved.device, dtype=torch.float32)
        state_pos[0::2] = 1.0
        state_pos = state_pos.view(1, two_L, 1)
        action_pos = 1.0 - state_pos
        mask_expand = mask[:, :, None]
        replacement = mask_expand * (state_pos * state_mtk + action_pos * action_mtk)
        x_masked = interleaved * (1.0 - mask_expand) + replacement
        return x_masked, mask, ids_restore

    # --- copied optional init helper from original ---
    def _maybe_init_from_qwen3(self) -> None:
        from transformers import AutoModel

        name_or_path = getattr(self.config, "qwen3_pretrained_name_or_path", None)
        if not name_or_path:
            return

        pretrained = AutoModel.from_pretrained(
            name_or_path,
            torch_dtype="auto",
            low_cpu_mem_usage=True,
        )

        src_sd = pretrained.state_dict()
        layer_prefix = None
        for p in ("model.layers.", "layers."):
            if any(k.startswith(p) for k in src_sd.keys()):
                layer_prefix = p
                break

        norm_prefix = None
        for p in ("model.norm.", "norm."):
            if any(k.startswith(p) for k in src_sd.keys()):
                norm_prefix = p
                break

        if layer_prefix is None:
            return

        def _map_layer_key(target_key: str, module_prefix: str, layer_offset: int) -> str | None:
            rest = target_key[len(module_prefix) + 1 :]
            parts = rest.split(".", 1)
            if len(parts) != 2:
                return None
            try:
                tgt_idx = int(parts[0])
            except ValueError:
                return None
            src_idx = tgt_idx + int(layer_offset)
            return f"{layer_prefix}{src_idx}.{parts[1]}"

        own_sd = self.state_dict()
        to_load: dict[str, torch.Tensor] = {}
        matched = 0
        missing = 0
        shape_mismatch = 0

        init_enc = bool(getattr(self.config, "qwen3_init_action_encoder", True))
        init_dec = bool(getattr(self.config, "qwen3_init_action_decoder", True))
        init_norm = bool(getattr(self.config, "qwen3_init_norm", True))
        enc_off = int(getattr(self.config, "qwen3_encoder_layer_offset", 0))
        dec_off = int(getattr(self.config, "qwen3_decoder_layer_offset", 0))

        # NOTE: decoder has AdaRMSNorm (extra cond_mlp weights), but many weights still match:
        # - action_decoder.*.self_attn.*
        # - action_decoder.*.mlp.*
        # - action_decoder.*.(input_layernorm|post_attention_layernorm).weight  (load RMS weight only)
        # - norm.weight (load RMS weight only)
        for k, tgt_tensor in own_sd.items():
            src_key = None
            if init_enc and k.startswith("action_encoder."):
                src_key = _map_layer_key(k, "action_encoder", enc_off)
            elif init_dec and k.startswith("action_decoder."):
                # Skip timestep-conditioned MLP weights (no counterpart in Qwen3)
                if ".cond_mlp." in k:
                    continue
                src_key = _map_layer_key(k, "action_decoder", dec_off)
            elif init_norm and k == "norm.weight" and norm_prefix is not None:
                src_key = f"{norm_prefix}weight"

            if not src_key:
                continue
            src_tensor = src_sd.get(src_key, None)
            if src_tensor is None:
                missing += 1
                continue
            if src_tensor.shape != tgt_tensor.shape:
                shape_mismatch += 1
                continue

            to_load[k] = src_tensor.to(device=tgt_tensor.device, dtype=tgt_tensor.dtype)
            matched += 1

        self.load_state_dict(to_load, strict=False)
        print(
            f"Initialized from Qwen3 checkpoint {name_or_path}. "
            f"matched={matched} missing={missing} shape_mismatch={shape_mismatch} prefix={layer_prefix}"
        )

        if matched == 0:
            # Most common culprit: config dims don't match Qwen3 checkpoint.
            src_cfg = getattr(pretrained, "config", None)
            if src_cfg is not None:
                fields = [
                    "hidden_size",
                    "intermediate_size",
                    "num_hidden_layers",
                    "num_attention_heads",
                    "num_key_value_heads",
                    "head_dim",
                    "rms_norm_eps",
                ]
                diffs = []
                for f in fields:
                    if hasattr(src_cfg, f) and hasattr(self.config, f):
                        a = getattr(self.config, f)
                        b = getattr(src_cfg, f)
                        if a != b:
                            diffs.append((f, a, b))
                if diffs:
                    print("[ActionModelFM] Qwen3 init got 0 matches. Config differs from checkpoint:")
                    for f, a, b in diffs:
                        print(f"  - {f}: ActionModelConfig={a} vs Qwen3={b}")

    def forward(
        self,
        examples: List[dict] = None,
        **kwargs: Unpack[TransformersKwargs],
    ):
        device = next(self.parameters()).device
        batch_size = len(examples)

        # =========================================================================
        # 1. Variable-length Horizon (same as original)
        # =========================================================================
        raw_actions = torch.tensor(
            np.array([ex["action"] for ex in examples]),
            device=device,
            dtype=torch.float32,
        )  # [B, L, D]

        use_state = self.use_state
        raw_states = None
        if use_state:
            raw_states = torch.tensor(
                np.array([ex["state"] for ex in examples]),
                device=device,
                dtype=torch.float32,
            )  # [B, L, state_dim]

        # =========================================================================
        # 2. Action (and optional State) Input Construction & Masking (DAE)
        # Encoder sequence: cls, dataset_tokens, [state_0, action_0, state_1, action_1, ...]
        # Two-view (masked + clean) when use_masked_action_recon.
        # =========================================================================
        with torch.autocast("cuda", dtype=torch.float32):
            clean_action_embeds = self.action_proj_in(raw_actions)  # [B, L, H]
            if use_state:
                clean_state_embeds = self.state_proj_in(raw_states)  # [B, L, H]
                # Interleave: [s0, a0, s1, a1, ...] -> [B, 2*L, H]
                clean_inputs_embeds = torch.stack(
                    [clean_state_embeds, clean_action_embeds], dim=2
                ).reshape(batch_size, 2 * raw_actions.shape[1], -1)
            else:
                clean_inputs_embeds = clean_action_embeds

            masked_inputs_embeds = clean_inputs_embeds
            if self.use_masked_action_recon:
                if use_state:
                    if getattr(self.config, "mask_ratio_mode", "fixed") == "uniform_per_traj":
                        mr_min = float(getattr(self.config, "mask_ratio_min", self.config.mask_ratio))
                        mr_max = float(getattr(self.config, "mask_ratio_max", self.config.mask_ratio))
                        per_traj_mr = torch.rand((batch_size,), device=device) * (mr_max - mr_min) + mr_min
                        masked_inputs_embeds, _, _ = self.random_masking_interleaved(clean_inputs_embeds, per_traj_mr)
                    else:
                        masked_inputs_embeds, _, _ = self.random_masking_interleaved(
                            clean_inputs_embeds, float(self.config.mask_ratio)
                        )
                else:
                    if getattr(self.config, "mask_ratio_mode", "fixed") == "uniform_per_traj":
                        mr_min = float(getattr(self.config, "mask_ratio_min", self.config.mask_ratio))
                        mr_max = float(getattr(self.config, "mask_ratio_max", self.config.mask_ratio))
                        per_traj_mr = torch.rand((batch_size,), device=device) * (mr_max - mr_min) + mr_min
                        masked_inputs_embeds, _, _ = self.random_masking(clean_inputs_embeds, per_traj_mr)
                    else:
                        masked_inputs_embeds, _, _ = self.random_masking(clean_inputs_embeds, float(self.config.mask_ratio))

            # =========================================================================
            # 3. Dataset Soft Prompt (same as original)
            # =========================================================================
            dataset_ids = [ex.get("dataset_id") for ex in examples]
            dataset_ids_tensor = torch.tensor(dataset_ids, device=device, dtype=torch.long)
            ds_embeds = self.dataset_embed(dataset_ids_tensor).view(
                batch_size, self.config.num_data_tokens, self.config.hidden_size
            )

            cls_token_expanded = self.cls_token.expand(batch_size, -1, -1)
            encoder_inputs_clean = torch.cat((cls_token_expanded, ds_embeds, clean_inputs_embeds), dim=1)
            encoder_inputs_masked = torch.cat((cls_token_expanded, ds_embeds, masked_inputs_embeds), dim=1)

            seq_len = encoder_inputs_clean.shape[1]
            enc_bs = batch_size * 2 if self.use_masked_action_recon else batch_size
            encoder_attention_mask = torch.ones((enc_bs, 1, seq_len, seq_len), device=device, dtype=torch.bool)
            encoder_pos_ids = torch.arange(seq_len, device=device).unsqueeze(0)
            # rotary embeddings are position-based; we keep position_ids batch=1 and broadcast.
            enc_pos_emb = self.rotary_emb(encoder_inputs_clean, encoder_pos_ids)

            hidden_states = (
                torch.cat((encoder_inputs_masked, encoder_inputs_clean), dim=0)
                if self.use_masked_action_recon
                else encoder_inputs_clean
            )
            for encoder_layer in self.action_encoder:
                hidden_states = encoder_layer(
                    hidden_states,
                    attention_mask=encoder_attention_mask,
                    position_embeddings=enc_pos_emb,
                    position_ids=encoder_pos_ids,
                    **kwargs,
                )

            if self.use_masked_action_recon:
                hidden_masked, hidden_clean = hidden_states.chunk(2, dim=0)
                action_embedding_masked = F.normalize(hidden_masked[:, :1, :], p=2, dim=-1)
                action_embedding_clean = F.normalize(hidden_clean[:, :1, :], p=2, dim=-1)
            else:
                action_embedding_clean = F.normalize(hidden_states[:, :1, :], p=2, dim=-1)
                action_embedding_masked = None

            # =========================================================================
            # 4. Flow-matching Decoder
            # =========================================================================
            t = self._sample_fm_time(batch_size, device=device, dtype=raw_actions.dtype)  # (B,)
            noise = torch.randn_like(raw_actions)
            noisy_actions = t[:, None, None] * noise + (1 - t[:, None, None]) * raw_actions
            target_velocity = noise - raw_actions

            noisy_embeds = self.action_proj_in(noisy_actions)
            if self.use_masked_action_recon:
                # Single decoder forward for both views in one batch.
                decoder_cond = torch.cat((action_embedding_clean, action_embedding_masked), dim=0)
                noisy_embeds = torch.cat((noisy_embeds, noisy_embeds), dim=0)
                t = torch.cat((t, t), dim=0)
                target_velocity = torch.cat((target_velocity, target_velocity), dim=0)
            else:
                decoder_cond = action_embedding_clean

            decoder_inputs = torch.cat((decoder_cond, noisy_embeds), dim=1)  # [B or 2B, 1+L, H]

            dec_seq_len = decoder_inputs.shape[1]
            dec_bs = decoder_inputs.shape[0]
            decoder_attention_mask = torch.ones((dec_bs, 1, dec_seq_len, dec_seq_len), device=device, dtype=torch.bool)
            dec_pos_ids = torch.arange(dec_seq_len, device=device).unsqueeze(0)
            dec_pos_emb = self.rotary_emb(decoder_inputs, dec_pos_ids)
            temb = self._fm_temb(t)

            hidden_states = decoder_inputs
            for decoder_layer in self.action_decoder:
                hidden_states = decoder_layer(
                    hidden_states,
                    temb=temb,
                    attention_mask=decoder_attention_mask,
                    position_embeddings=dec_pos_emb,
                    position_ids=dec_pos_ids,
                )

            hidden_states = self.norm(hidden_states, temb)
            pred_velocity = self.action_proj_out(hidden_states[:, 1:, :])

            if self.use_masked_action_recon:
                pred_clean, pred_masked = pred_velocity.chunk(2, dim=0)
                target_clean, target_masked = target_velocity.chunk(2, dim=0)
                recon_loss_clean = F.mse_loss(pred_clean, target_clean)
                recon_loss_masked = F.mse_loss(pred_masked, target_masked)
                recon_loss = 0.5 * (recon_loss_clean + recon_loss_masked)
            else:
                recon_loss = F.mse_loss(pred_velocity, target_velocity)
            return recon_loss

    def recon_loss(self, actions, dataset_ids: list[int], state=None, **kwargs):
        """
        Same interface as `ActionModel.recon_loss`, but using flow-matching decoder loss.

        Args:
            actions: (B, L, action_dim)
            dataset_ids: list[int]; used for dataset soft prompt when state is provided.
            state: optional (B, L, state_dim); if provided and state_proj_in exists,
                   encoder sees interleaved sequence [state_0, action_0, state_1, action_1, ...].
        Returns:
            scalar loss
        """
        # Optional fast-path: pass a precomputed action embedding to avoid another encoder forward.
        action_embedding = kwargs.pop("action_embedding", None)
        t = kwargs.pop("t", None)
        noise = kwargs.pop("noise", None)

        if action_embedding is None:
            action_embedding = self.encode_actions(actions, dataset_ids, state, **kwargs)

        return self.recon_loss_from_embedding(
            action_embedding=action_embedding,
            actions=actions,
            t=t,
            noise=noise,
        )

    def recon_loss_from_embedding(
        self,
        action_embedding: torch.Tensor,
        actions: torch.Tensor,
        t: torch.Tensor | None = None,
        noise: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """
        Flow-matching velocity loss conditioned on a provided action embedding.

        This is the preferred interface when you already have an action embedding (e.g., from VLM projector),
        since it avoids an extra action-encoder forward.

        Args:
            action_embedding: (B, H) or (B, 1, H), assumed L2-normalized (recommended).
            actions: (B, L, action_dim)
            t: optional (B,) time; if None sample internally
            noise: optional (B, L, action_dim) noise; if None sample internally
        """
        if action_embedding.dim() == 2:
            action_embedding = action_embedding.unsqueeze(1)
        if action_embedding.dim() != 3 or action_embedding.shape[1] != 1:
            raise ValueError(f"Expected action_embedding shape (B,1,H) or (B,H); got {tuple(action_embedding.shape)}")

        batch_size = actions.shape[0]
        device = actions.device
        dtype = actions.dtype

        if t is None:
            t = self._sample_fm_time(batch_size, device=device, dtype=dtype)
        if noise is None:
            noise = torch.randn_like(actions)

        noisy_actions = t[:, None, None] * noise + (1 - t[:, None, None]) * actions
        target_velocity = noise - actions

        temb = self._fm_temb(t)
        action_embeds = self.action_proj_in(noisy_actions)
        hidden_states = torch.cat((action_embedding, action_embeds), dim=1)

        dec_seq_len = hidden_states.shape[1]
        decoder_attention_mask = torch.ones(
            (batch_size, 1, dec_seq_len, dec_seq_len),
            device=device,
            dtype=torch.bool,
        )
        dec_pos_ids = torch.arange(dec_seq_len, device=device).unsqueeze(0)
        dec_pos_emb = self.rotary_emb(hidden_states, dec_pos_ids)

        for decoder_layer in self.action_decoder:
            hidden_states = decoder_layer(
                hidden_states,
                temb=temb,
                attention_mask=decoder_attention_mask,
                position_embeddings=dec_pos_emb,
                position_ids=dec_pos_ids,
            )

        hidden_states = self.norm(hidden_states, temb)
        pred_velocity = self.action_proj_out(hidden_states[:, 1:, :])
        return F.mse_loss(pred_velocity, target_velocity)

    def encode_actions(self, actions, dataset_ids: list[int], state=None, **kwargs):
        """
        Encode action chunk (and optional state chunk) to a single CLS embedding.

        Args:
            actions: (B, L, action_dim)
            state: optional (B, L, state_dim); if provided and state_proj_in exists,
                   encoder sees interleaved sequence [state_0, action_0, state_1, action_1, ...].
            dataset_ids: list[int]; used for dataset soft prompt when state is provided.
        """
        action_embeds = self.action_proj_in(actions)
        batch_size = action_embeds.shape[0]
        use_state = state is not None and self.state_proj_in is not None
        if use_state:
            state_embeds = self.state_proj_in(state)
            L = action_embeds.shape[1]
            inputs_embeds = torch.stack(
                [state_embeds, action_embeds], dim=2
            ).reshape(batch_size, 2 * L, -1)
        else:
            inputs_embeds = action_embeds

        cls_token_expanded = self.cls_token.expand(batch_size, -1, -1)

        dataset_ids_tensor = torch.tensor(dataset_ids, device=action_embeds.device, dtype=torch.long)
        ds_embeds = self.dataset_embed(dataset_ids_tensor).view(
            batch_size, self.config.num_data_tokens, self.config.hidden_size
        )
        inputs_embeds = torch.cat((cls_token_expanded, ds_embeds, inputs_embeds), dim=1)

        seq_len = inputs_embeds.shape[1]
        encoder_attention_mask = torch.ones(
            (batch_size, 1, seq_len, seq_len),
            device=inputs_embeds.device,
            dtype=torch.bool,
        )
        encoder_pos_ids = torch.arange(seq_len, device=inputs_embeds.device).unsqueeze(0)
        enc_pos_emb = self.rotary_emb(inputs_embeds, encoder_pos_ids)

        hidden_states = inputs_embeds
        for encoder_layer in self.action_encoder:
            hidden_states = encoder_layer(
                hidden_states,
                attention_mask=encoder_attention_mask,
                position_embeddings=enc_pos_emb,
                position_ids=encoder_pos_ids,
                **kwargs,
            )

        action_embedding = hidden_states[:, :1, :]
        return F.normalize(action_embedding, p=2, dim=-1)

    @torch.no_grad()
    def decode_actions(self, action_embedding, chunk_size, **kwargs):
        """
        FM sampling via simple Euler integration of the learned velocity field.
        """
        if chunk_size is None:
            chunk_size = self.config.max_action_chunk_size

        if action_embedding.dim() == 2:
            action_embedding = action_embedding.unsqueeze(1)

        batch_size = action_embedding.shape[0]
        device = action_embedding.device
        dtype = action_embedding.dtype

        actions = torch.randn((batch_size, chunk_size, self.config.action_size), device=device, dtype=dtype)
        num_steps = max(int(self.fm_num_inference_steps), 1)
        dt = -1.0 / float(num_steps)

        for step in range(num_steps):
            t = torch.full((batch_size,), 1.0 - step / float(num_steps), device=device, dtype=dtype)
            temb = self._fm_temb(t)

            action_embeds = self.action_proj_in(actions)
            hidden_states = torch.cat((action_embedding, action_embeds), dim=1)

            dec_seq_len = hidden_states.shape[1]
            decoder_attention_mask = torch.ones((batch_size, 1, dec_seq_len, dec_seq_len), device=device, dtype=torch.bool)
            dec_pos_ids = torch.arange(dec_seq_len, device=device).unsqueeze(0)
            dec_pos_emb = self.rotary_emb(hidden_states, dec_pos_ids)

            for decoder_layer in self.action_decoder:
                hidden_states = decoder_layer(
                    hidden_states,
                    temb=temb,
                    attention_mask=decoder_attention_mask,
                    position_embeddings=dec_pos_emb,
                    position_ids=dec_pos_ids,
                )

            hidden_states = self.norm(hidden_states, temb)
            pred_velocity = self.action_proj_out(hidden_states[:, 1:, :])
            actions = actions + dt * pred_velocity

        return actions


__all__ = [
    "ActionModelFM",
]

if __name__ == "__main__":
    config = ActionModelConfig()
    action_model = ActionModelFM(config)
    print(action_model)

    print("Total number of DiT parameters: ",
        sum(p.numel() for p in action_model.parameters() if p.requires_grad))

    fake_actions = torch.randn(10, 15, 64).to("cuda:7")

    sample = {
        "action": np.random.uniform(-1, 1, size=(16, 29)).astype(np.float16),  # action_chunk, action_dim (unified 29D)
        "lang": "put the ball on the table",
    }

    batch = [sample, sample]
    device = torch.device("cuda:7" if torch.cuda.is_available() else "cpu")
    action_model = action_model.to(device)

    outputs = action_model(batch)
    print(outputs)

    action_embedding = action_model.encode_actions(fake_actions)
    print(f"action_embedding: {action_embedding.shape}")

    reconstructed_actions = action_model.decode_actions(action_embedding, chunk_size=15)
    print(f"reconstructed_actions: {reconstructed_actions.shape}")