ConorWang committed
Commit 03426f9 · verified · 1 Parent(s): 6140022

Upload 10 files
eval_sigma_vla_rollout.py ADDED
@@ -0,0 +1,1402 @@
+ # eval_sigma_vla_rollout.py
+ # Offline closed-loop evaluation for a Telepathy-augmented VLA on top of the PI05 policy backbone.
+ #
+ # Key design:
+ # - base_model_id is a LeRobot/OpenPI policy repo (e.g., lerobot/pi05_base or your fine-tuned Sigma repo).
+ # - We load PI05Policy via LeRobot, NOT AutoModelForCausalLM.
+ # - Text embeddings are taken from the PI05 internal text backbone so that TelepathyLanguageModule
+ #   receives the same type of inputs used during training.
+ #
+ # Hardened in this revision:
+ # - Robust recursive shard discovery under any naming scheme and subfolder layout.
+ # - Shard content structure normalization (list-of-samples, or dict{samples/data}).
+ # - Collate auto-adapts to the real schema: vision/state/action/text, with time-dim collapse for vision.
+ # - Action GT supports dict-style branches or a single tensor.
+ # - Metrics tolerate missing multi-branch outputs (fallback to "action").
+ # - Text token dtype/device aligned to the model dtype for mixed-precision safety.
+ # - Robot state time-dim collapse + pad/trim to the state encoder's expected dim.
+ # - Dynamic projection to align vision/state token hidden size to the vision backbone dim (768),
+ #   and project text to the same dim BEFORE feeding the language module.
+ # - Optional max_text_len to avoid tokenizer truncation warnings.
+ # - Action input contract hardening:
+ #     * high_level_rep 2D -> 3D
+ #     * tau None/2D -> 3D
+ #     * tau length aligned to high_level_rep length
+ #     * tau last-dim auto pad/trim so concat(high_level_rep, tau) matches action_condition_proj in_features
+ # - tokenizer_id can be a LOCAL path; when it exists locally we load with local_files_only
+ # - _align_target handles 2D<->3D mismatches (fixes MSE crashes)
+ # - removed the duplicated "high_level_rep/tau re-normalization" that overwrote the hardening
+ #
+ # NEW in this patch:
+ # - cosine_alignment auto-aligns hidden sizes (fixes the 256 vs 2048 crash).
+ # - semantic pooling guard supports 2D/3D factors safely.
+ # - alignment metric ignores zero-length cases robustly.
+ #
+ # EXTRA HARDENING (this patch, for the baseline issue):
+ # - Try strict loading of PI05Policy if the LeRobot version supports it.
+ # - Verify tokenizer vocab size and special-token ids match the PI05 text embedding table.
+ # - Fail fast with a clear message if a mismatch is detected (unless explicitly overridden).
+ #
+ # NEW in this hard-set patch:
+ # - Per-sample MSE is exposed from the success proxy.
+ # - A "hard set" is defined as samples whose branch-wise MSE exceeds hard thresholds.
+ # - Hard-set averages (MSE and fraction of samples) are reported alongside global metrics.
+ #
+ # NEW in this adapter patch:
+ # - sigma_telepathy_adapter is applied at eval time (when telepathy is enabled) to gate
+ #   Telepathy residuals based on their magnitude and tau strength, optionally using
+ #   offline base_action_* if present in the shards.
+
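+ # Illustrative CLI invocation (a sketch; the repo ids below are placeholders,
+ # not shipped defaults):
+ #   python eval_sigma_vla_rollout.py \
+ #       --base_model_id lerobot/pi05_base \
+ #       --artifacts_repo_id <your-artifacts-repo> \
+ #       --tokenizer_id <pi05-text-backbone-tokenizer> \
+ #       --dtype bf16 --batch_size 4 --use_telepathy_adapter
+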
+ from __future__ import annotations
+
+ import os
+ import glob
+ import json
+ import argparse
+ import importlib
+ from typing import Any, Dict, List, Optional, Tuple
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.utils.data import Dataset, DataLoader
+
+ from dotenv import load_dotenv
+ from accelerate import Accelerator
+ from accelerate.utils import set_seed
+
+ try:
+     from huggingface_hub import snapshot_download
+ except Exception:
+     snapshot_download = None  # type: ignore
+
+ from vision_sigma_vla import TelepathyVisionModule, VisionConfig
+ from language_sigma_vla import TelepathyLanguageModule, LanguageConfig
+ from action_sigma_vla import TelepathyActionModule, ActionConfig
+ from sigma_telepathy_adapter import SigmaTelepathyAdapter, SigmaTelepathyAdapterConfig
+
+
+ def ensure_sigma_artifacts_from_hf(
+     repo_id: str,
+     hf_token: Optional[str],
+     local_cache_root: str,
+ ) -> Dict[str, str]:
+     """
+     Download Sigma artifacts from an HF repo into a local cache folder.
+     Returns local paths for shard_dir and telepathy_heads_path.
+
+     We only pull:
+         storage/sigma_pickplace/**
+         storage/sigma_lora_out/**
+     """
+     if snapshot_download is None:
+         raise ImportError(
+             "huggingface_hub is not available but auto-download was requested. "
+             "Please `pip install huggingface_hub` or download artifacts manually."
+         )
+
+     os.makedirs(local_cache_root, exist_ok=True)
+     local_dir = snapshot_download(
+         repo_id=repo_id,
+         token=hf_token,
+         local_dir=os.path.join(local_cache_root, repo_id.replace("/", "__")),
+         local_dir_use_symlinks=False,
+         allow_patterns=[
+             "storage/sigma_pickplace/**",
+             "storage/sigma_lora_out/**",
+         ],
+     )
+
+     shard_dir = os.path.join(local_dir, "storage", "sigma_pickplace")
+     telepathy_heads_path = os.path.join(
+         local_dir, "storage", "sigma_lora_out", "sigma_telepathy_heads.pt"
+     )
+
+     return {
+         "local_repo_dir": local_dir,
+         "shard_dir": shard_dir,
+         "telepathy_heads_path": telepathy_heads_path,
+     }
+
+
+ def load_pi05_policy(
+     repo_id: str,
+     hf_token: Optional[str],
+     device: torch.device,
+     strict_load: bool = True,
+ ):
+     """
+     Load PI05Policy from LeRobot. We try a few import paths to be robust across versions.
+     If the LeRobot PI05Policy.from_pretrained supports strict loading, we enable it.
+     """
+     policy_cls = None
+     import_errors = []
+
+     candidate_paths = [
+         ("lerobot.policies.pi05.modeling_pi05", "PI05Policy"),
+         ("lerobot.policies.pi05", "PI05Policy"),
+     ]
+
+     for mod_name, cls_name in candidate_paths:
+         try:
+             mod = importlib.import_module(mod_name)
+             policy_cls = getattr(mod, cls_name)
+             break
+         except Exception as e:
+             import_errors.append(f"{mod_name}.{cls_name}: {type(e).__name__}: {e}")
+
+     if policy_cls is None:
+         raise ImportError(
+             "Failed to import PI05Policy from LeRobot. Tried:\n - "
+             + "\n - ".join(import_errors)
+         )
+
+     policy = None
+     tried = []
+     if strict_load:
+         try:
+             policy = policy_cls.from_pretrained(repo_id, token=hf_token, strict=True)
+             tried.append("from_pretrained(..., strict=True)")
+         except TypeError:
+             tried.append("strict=True not supported")
+         except Exception as e:
+             tried.append(f"strict=True failed: {type(e).__name__}: {e}")
+
+     if policy is None:
+         try:
+             policy = policy_cls.from_pretrained(repo_id, token=hf_token)
+             tried.append("from_pretrained(repo_id, token=...)")
+         except TypeError:
+             policy = policy_cls.from_pretrained(pretrained_name_or_path=repo_id, token=hf_token)
+             tried.append("from_pretrained(pretrained_name_or_path=..., token=...)")
+
+     if policy is None:
+         raise RuntimeError("PI05Policy loading returned None. Tried: " + "; ".join(tried))
+
+     policy = policy.to(device)
+     policy.eval()
+     return policy
+
+
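+ # Usage sketch (assumes a LeRobot install that actually exposes PI05Policy at one
+ # of the import paths tried above):
+ #   policy = load_pi05_policy("lerobot/pi05_base", hf_token=None,
+ #                             device=torch.device("cuda"), strict_load=True)
+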
+ def get_policy_tokenizer(
+     policy,
+     repo_id: str,
+     hf_token: Optional[str],
+     forced_tokenizer_id: str = "",
+ ):
+     """
+     Robust tokenizer getter for PI05Policy.
+
+     IMPORTANT:
+     - Never call AutoTokenizer.from_pretrained(repo_id) because repo_id is a policy repo.
+     - If --tokenizer_id is provided and points to a LOCAL folder, load locally.
+     - Otherwise load from the HF id.
+     - If still missing, recursively search for a tokenizer/processor inside the policy.
+     """
+     from transformers import AutoTokenizer
+
+     if forced_tokenizer_id:
+         if os.path.exists(forced_tokenizer_id):
+             tok = AutoTokenizer.from_pretrained(
+                 forced_tokenizer_id,
+                 local_files_only=True,
+                 trust_remote_code=True,
+             )
+         else:
+             tok = AutoTokenizer.from_pretrained(
+                 forced_tokenizer_id,
+                 token=hf_token,
+                 trust_remote_code=True,
+             )
+         if tok.pad_token is None:
+             tok.pad_token = tok.eos_token
+         return tok
+
+     def _recursive_find_tokenizer(obj, max_depth: int = 4):
+         if obj is None or max_depth <= 0:
+             return None
+
+         for key in ["tokenizer", "processor", "text_tokenizer", "language_tokenizer"]:
+             if hasattr(obj, key):
+                 v = getattr(obj, key)
+                 if v is None:
+                     continue
+                 if key == "processor" and hasattr(v, "tokenizer") and v.tokenizer is not None:
+                     return v.tokenizer
+                 if hasattr(v, "__call__"):
+                     return v
+
+         nested_names = [
+             "paligemma_with_expert",
+             "paligemma",
+             "gemma_expert",
+             "language_model",
+             "text_model",
+             "model",
+             "policy",
+         ]
+         for name in nested_names:
+             if hasattr(obj, name):
+                 found = _recursive_find_tokenizer(
+                     getattr(obj, name), max_depth=max_depth - 1
+                 )
+                 if found is not None:
+                     return found
+         return None
+
+     tok = _recursive_find_tokenizer(policy)
+     if tok is not None:
+         if getattr(tok, "pad_token", None) is None and getattr(tok, "eos_token", None) is not None:
+             tok.pad_token = tok.eos_token
+         return tok
+
+     backbone_name = None
+     config_candidates = []
+     for attr in ["config", "model", "paligemma_with_expert", "paligemma"]:
+         if hasattr(policy, attr):
+             config_candidates.append(getattr(policy, attr))
+
+     def _try_get_name(cfg_obj):
+         if cfg_obj is None:
+             return None
+         for k in [
+             "_name_or_path",
+             "text_backbone_id",
+             "text_model_id",
+             "language_model_id",
+             "processor_name_or_path",
+             "tokenizer_name_or_path",
+         ]:
+             if hasattr(cfg_obj, k):
+                 v = getattr(cfg_obj, k)
+                 if isinstance(v, str) and v:
+                     return v
+         if hasattr(cfg_obj, "config"):
+             c = getattr(cfg_obj, "config")
+             if hasattr(c, "_name_or_path") and isinstance(c._name_or_path, str) and c._name_or_path:
+                 return c._name_or_path
+         return None
+
+     for c in config_candidates:
+         backbone_name = _try_get_name(c)
+         if backbone_name:
+             break
+
+     if backbone_name:
+         tok = AutoTokenizer.from_pretrained(
+             backbone_name, token=hf_token, trust_remote_code=True
+         )
+         if tok.pad_token is None:
+             tok.pad_token = tok.eos_token
+         return tok
+
+     raise ValueError(
+         f"Cannot obtain tokenizer from PI05Policy for repo '{repo_id}'. "
+         "Your lerobot PI05 port exposes neither a tokenizer/processor nor a backbone name. "
+         "Please pass --tokenizer_id explicitly."
+     )
+
+
+ def get_policy_text_embedding_layer(policy):
+     """
+     Locate the text embedding layer inside PI05Policy robustly.
+     """
+     def _recursive_find(obj, depth: int = 6):
+         if obj is None or depth <= 0:
+             return None
+
+         if hasattr(obj, "get_input_embeddings"):
+             try:
+                 emb = obj.get_input_embeddings()
+                 if emb is not None:
+                     return emb
+             except Exception:
+                 pass
+
+         for key in ["embed_tokens", "embeddings", "token_embedding"]:
+             if hasattr(obj, key):
+                 v = getattr(obj, key)
+                 if isinstance(v, nn.Module):
+                     return v
+
+         nested_names = [
+             "model",
+             "paligemma_with_expert",
+             "paligemma",
+             "language_model",
+             "gemma_expert",
+             "text_model",
+             "policy",
+         ]
+         for name in nested_names:
+             if hasattr(obj, name):
+                 found = _recursive_find(getattr(obj, name), depth=depth - 1)
+                 if found is not None:
+                     return found
+
+         return None
+
+     emb = _recursive_find(policy)
+     if emb is None:
+         raise AttributeError(
+             "Cannot locate the PI05 text embedding layer via recursive search. "
+             "Your PI05Policy likely changed internal naming. "
+             "Please inspect policy.model.* to confirm the embed_tokens location."
+         )
+     return emb
+
+
+ def verify_tokenizer_embedding_compat(
+     tokenizer,
+     text_embed_layer: nn.Module,
+     allow_mismatch: bool = False,
+ ):
+     """
+     Ensure tokenizer vocab/special ids are consistent with the PI05 text embedding table.
+     This directly prevents the 'embed_tokens.weight missing or misaligned' baseline issue.
+     """
+     emb_vocab = None
+     if isinstance(text_embed_layer, nn.Embedding):
+         emb_vocab = int(text_embed_layer.num_embeddings)
+     elif hasattr(text_embed_layer, "weight") and text_embed_layer.weight is not None:
+         emb_vocab = int(text_embed_layer.weight.size(0))
+
+     tok_vocab = getattr(tokenizer, "vocab_size", None)
+     if tok_vocab is None:
+         try:
+             tok_vocab = len(tokenizer)
+         except Exception:
+             tok_vocab = None
+
+     if emb_vocab is None or tok_vocab is None:
+         print("[WARN] Cannot infer tokenizer/embedding vocab sizes. Skipping compatibility check.")
+         return
+
+     if emb_vocab != tok_vocab:
+         msg = (
+             f"[ERROR] Tokenizer vocab size ({tok_vocab}) != PI05 embedding table size ({emb_vocab}). "
+             "This will corrupt text embeddings and invalidate the baseline. "
+             "Fix by passing --tokenizer_id matching the PI05 text backbone "
+             "(e.g., the original openpi/PI05 tokenizer) or re-exporting the policy with an aligned vocab."
+         )
+         if allow_mismatch:
+             print(msg.replace("[ERROR]", "[WARN]") + " Proceeding due to --allow_tokenizer_mismatch.")
+         else:
+             raise ValueError(msg)
+
+     for name in ["pad_token_id", "eos_token_id", "bos_token_id", "unk_token_id"]:
+         tid = getattr(tokenizer, name, None)
+         if tid is None:
+             continue
+         if not (0 <= int(tid) < emb_vocab):
+             msg = (
+                 f"[ERROR] Tokenizer {name}={tid} out of embedding range [0, {emb_vocab-1}]. "
+                 "Your tokenizer does not belong to this PI05 backbone."
+             )
+             if allow_mismatch:
+                 print(msg.replace("[ERROR]", "[WARN]") + " Proceeding due to --allow_tokenizer_mismatch.")
+             else:
+                 raise ValueError(msg)
+
+
+ class TelepathyVLA(nn.Module):
+     """
+     Full model wiring vision -> language -> action, matching the final architecture arrows.
+     """
+     def __init__(
+         self,
+         v_cfg: VisionConfig,
+         l_cfg: LanguageConfig,
+         a_cfg: ActionConfig,
+         disable_telepathy: bool = False,
+     ):
+         super().__init__()
+         self.vision = TelepathyVisionModule(v_cfg)
+         self.language = TelepathyLanguageModule(l_cfg)
+         self.action = TelepathyActionModule(a_cfg)
+         self.disable_telepathy = disable_telepathy
+         self.register_buffer("_m_prev", None, persistent=False)
+
+         self._proj_inited = False
+         self.text_proj: Optional[nn.Module] = None
+         self.vision_proj: Optional[nn.Module] = None
+         self.state_proj: Optional[nn.Module] = None
+
+     def reset_memory(self):
+         self._m_prev = None
+
+     @torch.no_grad()
+     def forward_once(
+         self,
+         vis_obs: torch.Tensor,
+         robot_state: torch.Tensor,
+         text_tokens: torch.Tensor,
+         depth_obs: Optional[torch.Tensor] = None,
+         audio_obs: Optional[torch.Tensor] = None,
+         attn_mask: Optional[torch.Tensor] = None,
+         return_intermediate: bool = False,
+     ) -> Dict[str, torch.Tensor]:
+
+         vis0 = self.vision(
+             vis_obs=vis_obs,
+             robot_state=robot_state,
+             depth_obs=depth_obs,
+             audio_obs=audio_obs,
+             telepathy_factors=None,
+             return_intermediate=return_intermediate,
+         )
+
+         vis_d = vis0["vision_tokens"].size(-1)
+         state_d = vis0["state_tokens"].size(-1)
+         target_d = vis_d
+
+         if not self._proj_inited:
+             self.text_proj = nn.Linear(text_tokens.size(-1), target_d, bias=False) \
+                 if text_tokens.size(-1) != target_d else nn.Identity()
+             self.vision_proj = nn.Identity() if vis_d == target_d else nn.Linear(vis_d, target_d, bias=False)
+             self.state_proj = nn.Identity() if state_d == target_d else nn.Linear(state_d, target_d, bias=False)
+
+             self.text_proj = self.text_proj.to(device=text_tokens.device, dtype=text_tokens.dtype)
+             self.vision_proj = self.vision_proj.to(device=text_tokens.device, dtype=text_tokens.dtype)
+             self.state_proj = self.state_proj.to(device=text_tokens.device, dtype=text_tokens.dtype)
+             self._proj_inited = True
+
+         assert self.text_proj is not None and self.vision_proj is not None and self.state_proj is not None
+
+         text_tokens = self.text_proj(text_tokens)
+         vision_tokens = self.vision_proj(vis0["vision_tokens"])
+         state_tokens = self.state_proj(vis0["state_tokens"])
+
+         lang_out = self.language(
+             text_tokens=text_tokens,
+             vision_tokens=vision_tokens,
+             state_tokens=state_tokens,
+             m_prev=self._m_prev,
+             attn_mask=attn_mask,
+             return_intermediate=return_intermediate,
+         )
+
+         raw_tau = lang_out.get("telepathy_factors", None)
+         self._m_prev = lang_out.get("m_t", None)
+
+         telepathy_scale = float(getattr(self, "telepathy_scale", 1.0))
+
+         if self.disable_telepathy:
+             tau = None
+             vis_out = vis0
+         else:
+             tau = raw_tau
+             if tau is not None:
+                 tau = tau * telepathy_scale
+             vis_out = self.vision(
+                 vis_obs=vis_obs,
+                 robot_state=robot_state,
+                 depth_obs=depth_obs,
+                 audio_obs=audio_obs,
+                 telepathy_factors=tau,
+                 return_intermediate=return_intermediate,
+             )
+
+         high_level_rep = lang_out.get("high_level_rep", None)
+         if high_level_rep is None:
+             raise KeyError("language output missing 'high_level_rep'.")
+
+         if high_level_rep.dim() == 2:
+             high_level_rep = high_level_rep.unsqueeze(1)
+
+         if tau is None:
+             B, L, _ = high_level_rep.shape
+             tau_dim = getattr(self.language, "tau_dim", 128)
+             tau = torch.zeros(B, L, tau_dim, device=high_level_rep.device, dtype=high_level_rep.dtype)
+         else:
+             if tau.dim() == 2:
+                 tau = tau.unsqueeze(1)
+             if tau.size(1) != high_level_rep.size(1):
+                 L = high_level_rep.size(1)
+                 if tau.size(1) == 1:
+                     tau = tau.expand(-1, L, -1)
+                 else:
+                     tau = tau[:, :L, :]
+
+         expected_in = None
+         acp = getattr(self.action, "action_condition_proj", None)
+         if acp is not None:
+             if hasattr(acp, "in_features"):
+                 expected_in = int(acp.in_features)
+             elif hasattr(acp, "net") and len(acp.net) > 0 and hasattr(acp.net[0], "in_features"):
+                 expected_in = int(acp.net[0].in_features)
+
+         if expected_in is not None:
+             d_high = high_level_rep.size(-1)
+             target_tau = expected_in - d_high
+
+             if target_tau <= 0:
+                 pass
+             else:
+                 if tau.size(-1) < target_tau:
+                     tau = F.pad(tau, (0, target_tau - tau.size(-1)))
+                 elif tau.size(-1) > target_tau:
+                     tau = tau[..., :target_tau]
+
+         state_for_action = vis_out["state_tokens"]
+         if state_for_action.dim() == 2:
+             state_for_action = state_for_action.unsqueeze(1)
+         elif state_for_action.dim() > 3:
+             state_for_action = state_for_action.view(
+                 state_for_action.size(0), -1, state_for_action.size(-1)
+             )
+
+         lang_d = high_level_rep.size(-1)
+
+         def _pad_or_trim_to(x: torch.Tensor, d: int) -> torch.Tensor:
+             cur_d = x.size(-1)
+             if cur_d == d:
+                 return x
+             if cur_d < d:
+                 return F.pad(x, (0, d - cur_d))
+             return x[..., :d]
+
+         state_for_action = _pad_or_trim_to(state_for_action, lang_d)
+
+         act_out = self.action(
+             high_level_rep=high_level_rep,
+             telepathy_factors=tau,
+             state_tokens=state_for_action,
+             return_intermediate=return_intermediate,
+         )
+
+         out: Dict[str, torch.Tensor] = {}
+         out.update(vis_out)
+         out.update(lang_out)
+         out.update(act_out)
+         return out
+
+
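+ # Data flow through forward_once (a sketch; the dims are illustrative, the real
+ # ones come from the module configs):
+ #   vis_obs [B, 3, H, W] + robot_state [B, Ds] --vision--> vision/state tokens
+ #   tokens projected to the vision hidden dim  --language--> high_level_rep, tau
+ #   (high_level_rep, tau, state tokens)        --action--> action_vector/chunk/trajectory
+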
+ class SigmaShardDataset(Dataset):
+     """
+     Loads .pt shards produced by dataset_preprocess_sigma_vla.py.
+     Each shard is a list of dict samples OR a dict containing a list (samples/data).
+     """
+     def __init__(self, shard_dir: str):
+         super().__init__()
+         if not os.path.isdir(shard_dir):
+             raise FileNotFoundError(
+                 f"shard_dir does not exist: {shard_dir}. Double-check the path."
+             )
+
+         patterns = [
+             os.path.join(shard_dir, "sigma_vla_shard_*.pt"),
+             os.path.join(shard_dir, "*.pt"),
+             os.path.join(shard_dir, "**", "*.pt"),
+         ]
+         paths: List[str] = []
+         for p in patterns:
+             paths.extend(glob.glob(p, recursive=True))
+
+         self.shard_paths = sorted(list(set(paths)))
+         if len(self.shard_paths) == 0:
+             raise FileNotFoundError(
+                 f"No .pt shards found under {shard_dir}. "
+                 "Your HF cache is empty or shards are not tracked by LFS."
+             )
+
+         print(f"[INFO] Found {len(self.shard_paths)} shard files. Example: {self.shard_paths[:3]}")
+
+         self.index_map: List[Tuple[int, int]] = []
+         self._shard_cache: Dict[int, List[Dict[str, Any]]] = {}
+
+         for sid, p in enumerate(self.shard_paths):
+             shard = torch.load(p, map_location="cpu")
+             shard_list = self._normalize_shard(shard, p)
+             for lid in range(len(shard_list)):
+                 self.index_map.append((sid, lid))
+
+         self.total = len(self.index_map)
+
+     def __len__(self):
+         return self.total
+
+     def _normalize_shard(self, shard_obj: Any, path: str) -> List[Dict[str, Any]]:
+         if isinstance(shard_obj, (list, tuple)):
+             return list(shard_obj)
+
+         if isinstance(shard_obj, dict):
+             for k in ["samples", "data", "items"]:
+                 if k in shard_obj and isinstance(shard_obj[k], (list, tuple)):
+                     return list(shard_obj[k])
+
+         raise TypeError(
+             f"Unsupported shard format in {path}. "
+             f"Expected list/tuple of samples or dict{{samples/data}}. "
+             f"Got type: {type(shard_obj).__name__}"
+         )
+
+     def _get_shard(self, sid: int) -> List[Dict[str, Any]]:
+         if sid not in self._shard_cache:
+             raw = torch.load(self.shard_paths[sid], map_location="cpu")
+             self._shard_cache[sid] = self._normalize_shard(raw, self.shard_paths[sid])
+         return self._shard_cache[sid]
+
+     def __getitem__(self, idx: int) -> Dict[str, Any]:
+         sid, lid = self.index_map[idx]
+         shard = self._get_shard(sid)
+         return shard[lid]
+
+
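+ # Minimal compatible shard, written offline (illustrative; the shapes are
+ # assumptions of this sketch, not requirements enforced by the dataset class):
+ #   torch.save(
+ #       [{"vision": torch.zeros(3, 224, 224), "state": torch.zeros(8),
+ #         "text": "pick up the cube", "action": torch.zeros(7)}],
+ #       "sigma_vla_shard_000000.pt",
+ #   )
+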
+ def collate_sigma(batch_list: List[Dict[str, Any]]) -> Dict[str, Any]:
+     """
+     Robust collate for Sigma shards.
+     """
+     s0 = batch_list[0]
+
+     def pick_key(sample: Dict[str, Any], candidates: List[str], field_name: str):
+         for k in candidates:
+             if k in sample:
+                 return k
+         raise KeyError(
+             f"Shard sample missing required field '{field_name}'. "
+             f"Tried keys: {candidates}. "
+             f"Available keys: {list(sample.keys())}"
+         )
+
+     if "vision" in s0:
+         vis_k = "vision"
+     else:
+         vis_k = pick_key(s0, ["vis_obs", "rgb_obs", "image", "images", "obs"], "vision/vis_obs")
+
+     vis_obs = torch.stack([b[vis_k] for b in batch_list], dim=0).float()
+     if vis_obs.dim() == 5:
+         vis_obs = vis_obs[:, -1]
+
+     depth_obs = None
+     if "depth" in s0:
+         depth_obs = torch.stack([b["depth"] for b in batch_list], dim=0).float()
+     elif any(k in s0 for k in ["depth_obs", "depths"]):
+         dk = pick_key(s0, ["depth_obs", "depths"], "depth")
+         depth_obs = torch.stack([b[dk] for b in batch_list], dim=0).float()
+
+     audio_obs = None
+     if "audio" in s0:
+         audio_obs = torch.stack([b["audio"] for b in batch_list], dim=0).float()
+     elif any(k in s0 for k in ["audio_obs", "audios"]):
+         ak = pick_key(s0, ["audio_obs", "audios"], "audio")
+         audio_obs = torch.stack([b[ak] for b in batch_list], dim=0).float()
+
+     if "state" in s0:
+         state_k = "state"
+     else:
+         state_k = pick_key(s0, ["robot_state", "proprio", "proprio_obs"], "state/robot_state")
+
+     robot_state = torch.stack([b[state_k] for b in batch_list], dim=0).float()
+
+     if "text" in s0:
+         texts = [b.get("text", "") for b in batch_list]
+     else:
+         text_k = pick_key(s0, ["text", "prompt", "instruction"], "text")
+         texts = [b.get(text_k, "") for b in batch_list]
+
+     if "action" in s0:
+         a0 = s0["action"]
+         if isinstance(a0, dict):
+             def pick_action_key(d, candidates, name):
+                 for k in candidates:
+                     if k in d:
+                         return k
+                 raise KeyError(
+                     f"Action dict missing '{name}'. Tried {candidates}. "
+                     f"Available action keys: {list(d.keys())}"
+                 )
+
+             vec_k = pick_action_key(a0, ["gt_action_vector", "action_vector", "vector", "vec"], "gt_action_vector")
+             chk_k = pick_action_key(a0, ["gt_action_chunk", "action_chunk", "chunk", "chk"], "gt_action_chunk")
+             trj_k = pick_action_key(a0, ["gt_action_trajectory", "action_trajectory", "trajectory", "traj"], "gt_action_trajectory")
+
+             gt_action_vector = torch.stack([b["action"][vec_k] for b in batch_list], dim=0).float()
+             gt_action_chunk = torch.stack([b["action"][chk_k] for b in batch_list], dim=0).float()
+             gt_action_trajectory = torch.stack([b["action"][trj_k] for b in batch_list], dim=0).float()
+         else:
+             act = torch.stack([b["action"] for b in batch_list], dim=0).float()
+             gt_action_vector = act
+             gt_action_chunk = act
+             gt_action_trajectory = act
+     else:
+         gt_vec_k = pick_key(s0, ["gt_action_vector", "action_vector", "gt_vec"], "gt_action_vector")
+         gt_chk_k = pick_key(s0, ["gt_action_chunk", "action_chunk", "gt_chunk"], "gt_action_chunk")
+         gt_trj_k = pick_key(s0, ["gt_action_trajectory", "action_trajectory", "gt_traj"], "gt_action_trajectory")
+
+         gt_action_vector = torch.stack([b[gt_vec_k] for b in batch_list], dim=0).float()
+         gt_action_chunk = torch.stack([b[gt_chk_k] for b in batch_list], dim=0).float()
+         gt_action_trajectory = torch.stack([b[gt_trj_k] for b in batch_list], dim=0).float()
+
+     # Optional offline base actions for the adapter; if missing, we simply do not include them.
+     base_action_vector = None
+     base_action_chunk = None
+     base_action_trajectory = None
+
+     has_base_top = any(
+         k in s0
+         for k in ["base_action_vector", "base_action_chunk", "base_action_trajectory"]
+     )
+     has_base_in_action = "action" in s0 and isinstance(s0["action"], dict) and any(
+         k in s0["action"]
+         for k in ["base_action_vector", "base_action_chunk", "base_action_trajectory"]
+     )
+
+     if has_base_top:
+         if "base_action_vector" in s0:
+             base_action_vector = torch.stack([b["base_action_vector"] for b in batch_list], dim=0).float()
+         if "base_action_chunk" in s0:
+             base_action_chunk = torch.stack([b["base_action_chunk"] for b in batch_list], dim=0).float()
+         if "base_action_trajectory" in s0:
+             base_action_trajectory = torch.stack([b["base_action_trajectory"] for b in batch_list], dim=0).float()
+     elif has_base_in_action:
+         a0 = s0["action"]
+
+         def pick_base_key(d, candidates):
+             for k in candidates:
+                 if k in d:
+                     return k
+             return None
+
+         vec_bk = pick_base_key(a0, ["base_action_vector", "base_vec"])
+         chk_bk = pick_base_key(a0, ["base_action_chunk", "base_chunk"])
+         trj_bk = pick_base_key(a0, ["base_action_trajectory", "base_traj"])
+
+         if vec_bk is not None:
+             base_action_vector = torch.stack([b["action"][vec_bk] for b in batch_list], dim=0).float()
+         if chk_bk is not None:
+             base_action_chunk = torch.stack([b["action"][chk_bk] for b in batch_list], dim=0).float()
+         if trj_bk is not None:
+             base_action_trajectory = torch.stack([b["action"][trj_bk] for b in batch_list], dim=0).float()
+
+     batch: Dict[str, Any] = {
+         "vis_obs": vis_obs,
+         "depth_obs": depth_obs,
+         "audio_obs": audio_obs,
+         "robot_state": robot_state,
+         "texts": texts,
+         "gt_action_vector": gt_action_vector,
+         "gt_action_chunk": gt_action_chunk,
+         "gt_action_trajectory": gt_action_trajectory,
+     }
+
+     if base_action_vector is not None:
+         batch["base_action_vector"] = base_action_vector
+     if base_action_chunk is not None:
+         batch["base_action_chunk"] = base_action_chunk
+     if base_action_trajectory is not None:
+         batch["base_action_trajectory"] = base_action_trajectory
+
+     return batch
+
+
+ def _align_target(pred_t: torch.Tensor, gt_t: torch.Tensor) -> torch.Tensor:
+     """
+     Align GT to prediction for MSE:
+     - handle 2D vs 3D mismatches by collapsing or expanding the time dimension.
+     - then align the last dim by pad/trim.
+     """
+     if gt_t.dim() == 3 and pred_t.dim() == 2:
+         gt_t = gt_t[:, -1, :]
+
+     if pred_t.dim() == 3 and gt_t.dim() == 2:
+         gt_t = gt_t.unsqueeze(1)
+         if gt_t.size(1) != pred_t.size(1):
+             gt_t = gt_t.expand(-1, pred_t.size(1), -1)
+
+     if pred_t.dim() == 3 and gt_t.dim() == 3:
+         Tp = pred_t.size(1)
+         Tg = gt_t.size(1)
+         if Tg < Tp:
+             pad = torch.zeros(
+                 gt_t.size(0), Tp - Tg, gt_t.size(2),
+                 device=gt_t.device, dtype=gt_t.dtype
+             )
+             gt_t = torch.cat([gt_t, pad], dim=1)
+         elif Tg > Tp:
+             gt_t = gt_t[:, :Tp, :]
+
+     pd = pred_t.size(-1)
+     gd = gt_t.size(-1)
+     if gd < pd:
+         gt_t = F.pad(gt_t, (0, pd - gd))
+     elif gd > pd:
+         gt_t = gt_t[..., :pd]
+
+     return gt_t
+
+
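+ # Alignment examples (illustrative):
+ #   pred [B, 7],    gt [B, T, 7] -> keep the last step:   gt[:, -1, :] -> [B, 7]
+ #   pred [B, T, 7], gt [B, 7]    -> broadcast over time:  gt -> [B, T, 7]
+ #   trailing action dims are zero-padded or trimmed to match pred.
+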
+ def _pred_action(pred: Dict[str, torch.Tensor], key: str) -> torch.Tensor:
+     if key in pred:
+         return pred[key]
+     if "action" in pred:
+         return pred["action"]
+     raise KeyError(
+         f"Pred dict missing action key '{key}' and fallback 'action'. "
+         f"Available pred keys: {list(pred.keys())}"
+     )
+
+
+ @torch.no_grad()
+ def compute_branch_mse(pred: Dict[str, torch.Tensor], batch: Dict[str, Any]) -> Dict[str, float]:
+     vec_pred = _pred_action(pred, "action_vector")
+     chk_pred = _pred_action(pred, "action_chunk")
+     trj_pred = _pred_action(pred, "action_trajectory")
+
+     device = vec_pred.device
+
+     gt_vec = _align_target(vec_pred, batch["gt_action_vector"].to(device))
+     gt_chk = _align_target(chk_pred, batch["gt_action_chunk"].to(device))
+     gt_trj = _align_target(trj_pred, batch["gt_action_trajectory"].to(device))
+
+     mse_vec = F.mse_loss(vec_pred, gt_vec).item()
+     mse_chk = F.mse_loss(chk_pred, gt_chk).item()
+     mse_trj = F.mse_loss(trj_pred, gt_trj).item()
+     return {"mse_vector": mse_vec, "mse_chunk": mse_chk, "mse_traj": mse_trj}
+
+
+ @torch.no_grad()
+ def compute_success_proxy(
+     pred: Dict[str, torch.Tensor],
+     batch: Dict[str, Any],
+     thr_vec: float,
+     thr_chk: float,
+     thr_trj: float,
+ ) -> Tuple[int, int, torch.Tensor, torch.Tensor, torch.Tensor]:
+     """
+     Returns:
+         num_success, num_total, mse_vec_per_sample, mse_chk_per_sample, mse_trj_per_sample
+     where per-sample MSE is averaged over all non-batch dims.
+     """
+     vec_pred = _pred_action(pred, "action_vector")
+     chk_pred = _pred_action(pred, "action_chunk")
+     trj_pred = _pred_action(pred, "action_trajectory")
+
+     device = vec_pred.device
+
+     gt_vec = _align_target(vec_pred, batch["gt_action_vector"].to(device))
+     gt_chk = _align_target(chk_pred, batch["gt_action_chunk"].to(device))
+     gt_trj = _align_target(trj_pred, batch["gt_action_trajectory"].to(device))
+
+     reduce_dims_vec = list(range(1, vec_pred.dim()))
+     reduce_dims_chk = list(range(1, chk_pred.dim()))
+     reduce_dims_trj = list(range(1, trj_pred.dim()))
+
+     mse_vec_s = ((vec_pred - gt_vec) ** 2).mean(dim=reduce_dims_vec)
+     mse_chk_s = ((chk_pred - gt_chk) ** 2).mean(dim=reduce_dims_chk)
+     mse_trj_s = ((trj_pred - gt_trj) ** 2).mean(dim=reduce_dims_trj)
+
+     success_mask = (mse_vec_s < thr_vec) & (mse_chk_s < thr_chk) & (mse_trj_s < thr_trj)
+     num_success = int(success_mask.sum().item())
+     num_total = int(success_mask.numel())
+
+     return num_success, num_total, mse_vec_s, mse_chk_s, mse_trj_s
+
+
+ @torch.no_grad()
+ def compute_telepathy_stability(pred: Dict[str, torch.Tensor]) -> float:
+     tau = pred.get("telepathy_factors", None)
+     if tau is None:
+         return float("nan")
+     return float((tau ** 2).mean().item())
+
+
+ @torch.no_grad()
+ def cosine_alignment(a: torch.Tensor, b: torch.Tensor) -> float:
+     """
+     Cosine alignment that is robust to hidden-size mismatch.
+     Accepts [B, D] or [B, T, D]. Pools time if present.
+     If dims differ, crops both to min(Da, Db) for a fair cosine check.
+     """
+     if a.dim() == 3:
+         a = a.mean(dim=1)
+     if b.dim() == 3:
+         b = b.mean(dim=1)
+
+     if a.numel() == 0 or b.numel() == 0:
+         return float("nan")
+
+     da, db = a.size(-1), b.size(-1)
+     if da != db:
+         d = min(da, db)
+         a = a[..., :d]
+         b = b[..., :d]
+
+     a = F.normalize(a, dim=-1)
+     b = F.normalize(b, dim=-1)
+     return float((a * b).sum(dim=-1).mean().item())
+
+
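+ # Example (illustrative): a [B, 256] vs b [B, 2048] are both cropped to [..., :256]
+ # before normalization, so the metric stays defined under a width mismatch.
+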
+ @torch.no_grad()
+ def build_text_tokens_from_policy(
+     tokenizer,
+     text_embed_layer: nn.Module,
+     texts: List[str],
+     device: torch.device,
+     target_dtype: torch.dtype,
+     max_text_len: int = 0,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+     """
+     Tokenize prompts and map them to embeddings using the PI05 internal embedding layer.
+     Returns (text_tokens, attn_mask).
+     """
+     if max_text_len and max_text_len > 0:
+         tok = tokenizer(
+             texts,
+             padding=True,
+             truncation=True,
+             max_length=max_text_len,
+             return_tensors="pt",
+         )
+     else:
+         tok = tokenizer(
+             texts,
+             padding=True,
+             truncation=False,
+             return_tensors="pt",
+         )
+
+     if hasattr(tok, "input_ids"):
+         input_ids = tok.input_ids
+         attn_mask = tok.attention_mask
+     else:
+         input_ids = tok["input_ids"]
+         attn_mask = tok.get("attention_mask", None)
+         if attn_mask is None:
+             attn_mask = torch.ones_like(input_ids)
+
+     input_ids = input_ids.to(device)
+     attn_mask = attn_mask.to(device)
+
+     text_tokens = text_embed_layer(input_ids).to(dtype=target_dtype)
+     return text_tokens, attn_mask
+
+
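+ # Sketch: with a Gemma-style tokenizer and a [V, D] embedding table, two prompts
+ # yield text_tokens [2, L, D] (cast to target_dtype) and attn_mask [2, L] on `device`.
+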
+ def main():
+     parser = argparse.ArgumentParser()
+
+     parser.add_argument("--sigma_env", type=str, default="sigma.env")
+     parser.add_argument("--shard_dir", type=str, default="")
+     parser.add_argument("--output_dir", type=str, default="./sigma_eval_out")
+
+     parser.add_argument(
+         "--base_model_id",
+         type=str,
+         required=True,
+         help="LeRobot/OpenPI policy repo, e.g., lerobot/pi05_base or your Sigma policy repo.",
+     )
+     parser.add_argument(
+         "--telepathy_heads_path",
+         type=str,
+         default="",
+         help="Path to sigma_telepathy_heads.pt. If empty, auto-fetch may fill it.",
+     )
+     parser.add_argument(
+         "--disable_telepathy",
+         action="store_true",
+         help="Disable telepathy injection (control run).",
+     )
+     parser.add_argument(
+         "--tokenizer_id",
+         type=str,
+         default="",
+         help="Explicit HF tokenizer id OR local tokenizer folder path.",
+     )
+
+     parser.add_argument("--max_text_len", type=int, default=0)
+
+     parser.add_argument(
+         "--artifacts_repo_id",
+         type=str,
+         default="",
+         help="HF repo containing storage/sigma_pickplace and storage/sigma_lora_out.",
+     )
+     parser.add_argument(
+         "--hf_cache_root",
+         type=str,
+         default="/workspace/.hf_sigma_cache",
+     )
+
+     parser.add_argument("--load_in_4bit", action="store_true")
+     parser.add_argument("--dtype", type=str, default="bf16")
+
+     parser.add_argument("--batch_size", type=int, default=4)
+     parser.add_argument("--num_workers", type=int, default=2)
+     parser.add_argument("--max_batches", type=int, default=-1)
+     parser.add_argument("--seed", type=int, default=42)
+     parser.add_argument(
+         "--shuffle",
+         action="store_true",
+         help="Shuffle dataset order to enable different random subsets per seed.",
+     )
+     parser.add_argument(
+         "--telepathy_scale",
+         type=float,
+         default=1.0,
+         help="Multiply telepathy_factors (tau) to control injection strength.",
+     )
+
+     parser.add_argument("--succ_thr_vec", type=float, default=0.05)
+     parser.add_argument("--succ_thr_chk", type=float, default=0.10)
+     parser.add_argument("--succ_thr_trj", type=float, default=0.10)
+
+     # Hard-set thresholds: if <=0, they default to 2x the success thresholds.
+     parser.add_argument(
+         "--hard_thr_vec",
+         type=float,
+         default=-1.0,
+         help="Per-sample MSE threshold for the 'hard' set on the vector branch; <=0 means 2x succ_thr_vec.",
+     )
+     parser.add_argument(
+         "--hard_thr_chk",
+         type=float,
+         default=-1.0,
+         help="Per-sample MSE threshold for the 'hard' set on the chunk branch; <=0 means 2x succ_thr_chk.",
+     )
+     parser.add_argument(
+         "--hard_thr_trj",
+         type=float,
+         default=-1.0,
+         help="Per-sample MSE threshold for the 'hard' set on the trajectory branch; <=0 means 2x succ_thr_trj.",
+     )
+
+     parser.add_argument(
+         "--strict_pi05_load",
+         action="store_true",
+         help="Try strict PI05Policy loading if supported by LeRobot.",
+     )
+     parser.add_argument(
+         "--allow_tokenizer_mismatch",
+         action="store_true",
+         help="Do not fail on tokenizer/embedding mismatch (NOT recommended for baseline runs).",
+     )
+
+     # Simple flag to enable/disable the adapter without touching telepathy itself.
+     parser.add_argument(
+         "--use_telepathy_adapter",
+         action="store_true",
+         help="If set and telepathy is enabled, apply sigma_telepathy_adapter to actions in eval.",
+     )
+
+     args = parser.parse_args()
+
+     if os.path.exists(args.sigma_env):
+         load_dotenv(args.sigma_env)
+     hf_token = os.getenv("HF_TOKEN", None)
+
+     accelerator = Accelerator(mixed_precision=args.dtype if args.dtype != "fp32" else "no")
+     set_seed(args.seed)
+     device = accelerator.device
+
+     if args.load_in_4bit:
+         print("[WARN] --load_in_4bit is ignored for the PI05Policy evaluator.")
+
+     artifacts_repo = args.artifacts_repo_id.strip()
+     if not artifacts_repo and args.base_model_id.startswith("Veltraxor/"):
+         artifacts_repo = args.base_model_id
+
+     need_shards = (not args.shard_dir) or (not os.path.isdir(args.shard_dir))
+     need_heads = (not args.telepathy_heads_path) or (not os.path.isfile(args.telepathy_heads_path))
+
+     if artifacts_repo and (need_shards or need_heads):
+         paths = ensure_sigma_artifacts_from_hf(
+             repo_id=artifacts_repo,
+             hf_token=hf_token,
+             local_cache_root=args.hf_cache_root,
+         )
+         if need_shards:
+             args.shard_dir = paths["shard_dir"]
+             print(f"[INFO] Using cached shard_dir: {args.shard_dir}")
+         if need_heads:
+             args.telepathy_heads_path = paths["telepathy_heads_path"]
+             print(f"[INFO] Using cached telepathy_heads_path: {args.telepathy_heads_path}")
+
+     if not args.shard_dir or not os.path.isdir(args.shard_dir):
+         raise FileNotFoundError(
+             f"shard_dir not found locally: {args.shard_dir}. "
+             "Either provide a valid local path or an artifacts_repo_id for auto-download."
+         )
+
+     if not args.telepathy_heads_path or not os.path.isfile(args.telepathy_heads_path):
+         raise FileNotFoundError(
+             f"telepathy_heads_path not found locally: {args.telepathy_heads_path}. "
+             "Either provide a valid local path or store it under storage/sigma_lora_out/ "
+             "in artifacts_repo_id for auto-download."
+         )
+
+     policy = load_pi05_policy(
+         args.base_model_id,
+         hf_token,
+         device=device,
+         strict_load=args.strict_pi05_load,
+     )
+
+     tokenizer = get_policy_tokenizer(
+         policy,
+         args.base_model_id,
+         hf_token,
+         forced_tokenizer_id=args.tokenizer_id,
+     )
+     text_embed_layer = get_policy_text_embedding_layer(policy)
+
+     verify_tokenizer_embedding_compat(
+         tokenizer=tokenizer,
+         text_embed_layer=text_embed_layer,
+         allow_mismatch=args.allow_tokenizer_mismatch,
+     )
+
+     v_cfg = VisionConfig()
+     l_cfg = LanguageConfig()
+     a_cfg = ActionConfig()
+     telepathy_vla = TelepathyVLA(v_cfg, l_cfg, a_cfg, disable_telepathy=args.disable_telepathy)
+     telepathy_vla.telepathy_scale = args.telepathy_scale
+
+     # Instantiate the Telepathy adapter (used only when telepathy is enabled and the flag is set).
+     adapter_cfg = SigmaTelepathyAdapterConfig()
+     telepathy_adapter = SigmaTelepathyAdapter(adapter_cfg).to(device)
+
+     if accelerator.is_main_process:
+         file_size_mb = os.path.getsize(args.telepathy_heads_path) / (1024 * 1024)
+         print(f"[CHECK-A] disable_telepathy={args.disable_telepathy}")
+         print(f"[CHECK-A] telepathy_heads_path={args.telepathy_heads_path} size={file_size_mb:.2f}MB")
+
+     sd = torch.load(args.telepathy_heads_path, map_location="cpu")
+
+     tensor_list = [v.detach().float().reshape(-1) for v in sd.values() if torch.is_tensor(v)]
+     if accelerator.is_main_process and len(tensor_list) > 0:
+         capped = [t[:100000] for t in tensor_list]
+         flat = torch.cat(capped, dim=0)
+         rms = torch.sqrt((flat ** 2).mean()).item()
+         print(f"[CHECK-A] heads_tensors={len(tensor_list)} mean={flat.mean().item():.6f} std={flat.std().item():.6f} rms={rms:.6f}")
+
+     missing, unexpected = telepathy_vla.load_state_dict(sd, strict=False)
+     if accelerator.is_main_process:
+         if len(missing) > 0 or len(unexpected) > 0:
+             print(f"[CHECK-A] loaded with strict=False. Missing={len(missing)} Unexpected={len(unexpected)}")
+             print(f"[CHECK-A] Missing keys (first 20): {missing[:20]}")
+             print(f"[CHECK-A] Unexpected keys (first 20): {unexpected[:20]}")
+         else:
+             print("[CHECK-A] heads fully matched (no missing/unexpected).")
+
+     telepathy_vla.eval()
+
+     ds = SigmaShardDataset(args.shard_dir)
+     dl = DataLoader(
+         ds,
+         batch_size=args.batch_size,
+         shuffle=args.shuffle,
+         num_workers=args.num_workers,
+         collate_fn=collate_sigma,
+         drop_last=False,
+         pin_memory=torch.cuda.is_available(),
+     )
+
+     telepathy_vla, dl = accelerator.prepare(telepathy_vla, dl)
+     # Unwrap a potential DDP wrapper once so custom methods (reset_memory,
+     # forward_once) stay reachable under multi-process eval.
+     model_ref = telepathy_vla.module if hasattr(telepathy_vla, "module") else telepathy_vla
+     target_dtype = next(telepathy_vla.parameters()).dtype
+
+     sum_mse_vec = 0.0
+     sum_mse_chk = 0.0
+     sum_mse_trj = 0.0
+     sum_tau_l2 = 0.0
+     sum_sem_align = 0.0
+     # Count batches where the tau/semantic metrics were finite, for unbiased averages.
+     n_tau_batches = 0
+     n_sem_batches = 0
+
+     # Hard-set aggregators
+     hard_thr_vec = args.hard_thr_vec if args.hard_thr_vec > 0.0 else 2.0 * args.succ_thr_vec
+     hard_thr_chk = args.hard_thr_chk if args.hard_thr_chk > 0.0 else 2.0 * args.succ_thr_chk
+     hard_thr_trj = args.hard_thr_trj if args.hard_thr_trj > 0.0 else 2.0 * args.succ_thr_trj
+
+     sum_hard_mse_vec = 0.0
+     sum_hard_mse_chk = 0.0
+     sum_hard_mse_trj = 0.0
+     total_hard_samples = 0
+
+     n_batches = 0
+     n_samples = 0
+
+     os.makedirs(args.output_dir, exist_ok=True)
+
+     for bidx, batch in enumerate(dl):
+         if args.max_batches > 0 and bidx >= args.max_batches:
+             break
+
+         model_ref.reset_memory()
+
+         B = batch["vis_obs"].size(0)
+         n_samples += B
+
+         text_tokens, attn_mask = build_text_tokens_from_policy(
+             tokenizer=tokenizer,
+             text_embed_layer=text_embed_layer,
+             texts=batch["texts"],
+             device=device,
+             target_dtype=target_dtype,
+             max_text_len=args.max_text_len,
+         )
+
+         robot_state = batch["robot_state"].to(device)
+         if robot_state.dim() == 3:
+             robot_state = robot_state[:, -1]
+
+         # Move optional base actions to device for the adapter.
+         if "base_action_vector" in batch:
+             batch["base_action_vector"] = batch["base_action_vector"].to(device)
+         if "base_action_chunk" in batch:
+             batch["base_action_chunk"] = batch["base_action_chunk"].to(device)
+         if "base_action_trajectory" in batch:
+             batch["base_action_trajectory"] = batch["base_action_trajectory"].to(device)
+
+         try:
+             expected_d = model_ref.vision.state_encoder.mlp[0].in_features
+         except Exception:
+             expected_d = robot_state.size(-1)
+
+         cur_d = robot_state.size(-1)
+         if cur_d < expected_d:
+             robot_state = F.pad(robot_state, (0, expected_d - cur_d))
+         elif cur_d > expected_d:
+             robot_state = robot_state[..., :expected_d]
+
+         pred = model_ref.forward_once(
+             vis_obs=batch["vis_obs"].to(device),
+             robot_state=robot_state,
+             depth_obs=batch["depth_obs"].to(device) if batch["depth_obs"] is not None else None,
+             audio_obs=batch["audio_obs"].to(device) if batch["audio_obs"] is not None else None,
+             text_tokens=text_tokens,
+             attn_mask=attn_mask,
+             return_intermediate=True,
+         )
+
+         if accelerator.is_main_process and bidx == 0:
+             model_ref.reset_memory()
+             prev_flag = bool(model_ref.disable_telepathy)
+             model_ref.disable_telepathy = True
+             pred_ctrl = model_ref.forward_once(
+                 vis_obs=batch["vis_obs"].to(device),
+                 robot_state=robot_state,
+                 depth_obs=batch["depth_obs"].to(device) if batch["depth_obs"] is not None else None,
+                 audio_obs=batch["audio_obs"].to(device) if batch["audio_obs"] is not None else None,
+                 text_tokens=text_tokens,
+                 attn_mask=attn_mask,
+                 return_intermediate=False,
+             )
+             model_ref.disable_telepathy = prev_flag
+
+             try:
+                 act_exp = _pred_action(pred, "action_vector")
+                 act_ctl = _pred_action(pred_ctrl, "action_vector")
+                 diff = (act_exp - act_ctl).abs().mean().item()
+                 print(f"[CHECK-B] telepathy_effect_mean_abs_diff(action_vector)={diff:.6f}")
+             except Exception as e:
+                 print(f"[CHECK-B] action diff check failed: {type(e).__name__}: {e}")
+
+         # Apply the Telepathy adapter only when telepathy is enabled and the flag is set.
+         if (not args.disable_telepathy) and args.use_telepathy_adapter:
+             pred = telepathy_adapter(pred, batch)
+
+         mse = compute_branch_mse(pred, batch)
+         tau_l2 = compute_telepathy_stability(pred)
+
+         (
+             _,
+             _,
+             mse_vec_s,
+             mse_chk_s,
+             mse_trj_s,
+         ) = compute_success_proxy(
+             pred,
+             batch,
+             thr_vec=args.succ_thr_vec,
+             thr_chk=args.succ_thr_chk,
+             thr_trj=args.succ_thr_trj,
+         )
+
+         # Hard-set accumulation: samples where any branch MSE exceeds the hard thresholds
+         hard_mask = (mse_vec_s > hard_thr_vec) | (mse_chk_s > hard_thr_chk) | (mse_trj_s > hard_thr_trj)
+         hard_count = int(hard_mask.sum().item())
+         if hard_count > 0:
+             sum_hard_mse_vec += mse_vec_s[hard_mask].sum().item()
+             sum_hard_mse_chk += mse_chk_s[hard_mask].sum().item()
+             sum_hard_mse_trj += mse_trj_s[hard_mask].sum().item()
+             total_hard_samples += hard_count
+
+         sem_factors = pred.get("semantic_factors", None)
+         if sem_factors is not None:
+             if sem_factors.dim() == 3:
+                 sem_pool = sem_factors.mean(dim=1)
+             elif sem_factors.dim() == 2:
+                 sem_pool = sem_factors
+             else:
+                 sem_pool = sem_factors.view(sem_factors.size(0), -1)
+
+             txt_pool = text_tokens.mean(dim=1)
+             sem_align = cosine_alignment(sem_pool, txt_pool)
+         else:
+             sem_align = float("nan")
+
+         sum_mse_vec += mse["mse_vector"]
+         sum_mse_chk += mse["mse_chunk"]
+         sum_mse_trj += mse["mse_traj"]
+         # NaN-safe accumulation: count only batches where the metric was defined,
+         # so the averages below are not diluted by skipped batches.
+         if not (tau_l2 != tau_l2):
+             sum_tau_l2 += tau_l2
+             n_tau_batches += 1
+         if not (sem_align != sem_align):
+             sum_sem_align += sem_align
+             n_sem_batches += 1
+
+         n_batches += 1
+
+         if accelerator.is_main_process and bidx % 20 == 0:
+             print(
+                 f"batch={bidx} "
+                 f"mse_vec={mse['mse_vector']:.4f} mse_chk={mse['mse_chunk']:.4f} mse_trj={mse['mse_traj']:.4f} "
+                 f"tau_l2={tau_l2:.4f} sem_align={sem_align:.4f}"
+             )
+
+     if accelerator.is_main_process:
+         avg_mse_vec = sum_mse_vec / max(1, n_batches)
+         avg_mse_chk = sum_mse_chk / max(1, n_batches)
+         avg_mse_trj = sum_mse_trj / max(1, n_batches)
+
+         # Average tau/semantic metrics only over batches where they were defined.
+         avg_tau_l2 = sum_tau_l2 / max(1, n_tau_batches)
+         avg_sem_align = sum_sem_align / max(1, n_sem_batches)
+
+         if total_hard_samples > 0:
+             avg_hard_mse_vec = sum_hard_mse_vec / float(total_hard_samples)
+             avg_hard_mse_chk = sum_hard_mse_chk / float(total_hard_samples)
+             avg_hard_mse_trj = sum_hard_mse_trj / float(total_hard_samples)
+         else:
+             avg_hard_mse_vec = float("nan")
+             avg_hard_mse_chk = float("nan")
+             avg_hard_mse_trj = float("nan")
+
+         hard_fraction = float(total_hard_samples / max(1, n_samples))
+
+         report = {
+             "num_samples": n_samples,
+             "num_batches": n_batches,
+             "avg_mse_vector": avg_mse_vec,
+             "avg_mse_chunk": avg_mse_chk,
+             "avg_mse_traj": avg_mse_trj,
+             "avg_tau_l2": avg_tau_l2,
+             "avg_semantic_text_alignment": avg_sem_align,
+             "hard_thresholds": {
+                 "vec": hard_thr_vec,
+                 "chk": hard_thr_chk,
+                 "trj": hard_thr_trj,
+             },
+             "avg_hard_mse_vector": avg_hard_mse_vec,
+             "avg_hard_mse_chunk": avg_hard_mse_chk,
+             "avg_hard_mse_traj": avg_hard_mse_trj,
+             "hard_sample_fraction": hard_fraction,
+             "total_hard_samples": int(total_hard_samples),
+         }
+
+         with open(
+             os.path.join(args.output_dir, "sigma_eval_report.json"),
+             "w",
+             encoding="utf-8",
+         ) as f:
+             json.dump(report, f, indent=2)
+         print("[DONE] Saved report:", report)
+
+
+ if __name__ == "__main__":
+     main()
modeling_pi05.py ADDED
@@ -0,0 +1,1264 @@
+ #!/usr/bin/env python
+
+ # Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import builtins
+ import logging
+ import math
+ from collections import deque
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Literal, TypedDict
+
+ import torch
+ import torch.nn.functional as F  # noqa: N812
+ from torch import Tensor, nn
+ from typing_extensions import Unpack
+
+ from lerobot.utils.import_utils import _transformers_available
+
+ # Conditional import for type checking and lazy loading
+ if TYPE_CHECKING or _transformers_available:
+     from transformers.models.auto import CONFIG_MAPPING
+     from transformers.models.gemma import modeling_gemma
+     from transformers.models.gemma.modeling_gemma import GemmaForCausalLM
+     from transformers.models.paligemma.modeling_paligemma import PaliGemmaForConditionalGeneration
+ else:
+     CONFIG_MAPPING = None
+     modeling_gemma = None
+     GemmaForCausalLM = None
+     PaliGemmaForConditionalGeneration = None
+
+ from lerobot.configs.policies import PreTrainedConfig
+ from lerobot.policies.pi05.configuration_pi05 import PI05Config
+ from lerobot.policies.pretrained import PreTrainedPolicy, T
+ from lerobot.policies.rtc.modeling_rtc import RTCProcessor
+ from lerobot.utils.constants import (
+     ACTION,
+     OBS_LANGUAGE_ATTENTION_MASK,
+     OBS_LANGUAGE_TOKENS,
+     OPENPI_ATTENTION_MASK_VALUE,
+ )
+
+
55
+ class ActionSelectKwargs(TypedDict, total=False):
56
+ inference_delay: int | None
57
+ prev_chunk_left_over: Tensor | None
58
+ execution_horizon: int | None
59
+
60
+
61
+ def get_safe_dtype(target_dtype, device_type):
62
+ """Get a safe dtype for the given device type."""
63
+ if device_type == "mps" and target_dtype == torch.float64:
64
+ return torch.float32
65
+ if device_type == "cpu":
66
+ # CPU doesn't support bfloat16, use float32 instead
67
+ if target_dtype == torch.bfloat16:
68
+ return torch.float32
69
+ if target_dtype == torch.float64:
70
+ return torch.float64
71
+ return target_dtype
72
+
73
+
74
+ def create_sinusoidal_pos_embedding( # see openpi `create_sinusoidal_pos_embedding` (exact copy)
75
+ time: torch.Tensor, dimension: int, min_period: float, max_period: float, device="cpu"
76
+ ) -> Tensor:
77
+ """Computes sine-cosine positional embedding vectors for scalar positions."""
78
+ if dimension % 2 != 0:
79
+ raise ValueError(f"dimension ({dimension}) must be divisible by 2")
80
+
81
+ if time.ndim != 1:
82
+ raise ValueError("The time tensor is expected to be of shape `(batch_size, )`.")
83
+
84
+ if isinstance(device, str):  # the default device="cpu" arrives as a plain string
+     device = torch.device(device)
+ dtype = get_safe_dtype(torch.float64, device.type)
85
+ fraction = torch.linspace(0.0, 1.0, dimension // 2, dtype=dtype, device=device)
86
+ period = min_period * (max_period / min_period) ** fraction
87
+
88
+ # Compute the outer product
89
+ scaling_factor = 1.0 / period * 2 * math.pi
90
+ sin_input = scaling_factor[None, :] * time[:, None]
91
+ return torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1)
92
+
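+ # Editorial note: a minimal usage sketch for the helper above. The batch
+ # size, dimension, and periods below are arbitrary example values, not the
+ # ones the model uses; the helper expects a 1-D tensor of scalar times.
+ def _demo_sinusoidal_pos_embedding():
+     t = torch.rand(4)  # one scalar position per batch element
+     emb = create_sinusoidal_pos_embedding(
+         t, dimension=128, min_period=4e-3, max_period=4.0, device=torch.device("cpu")
+     )
+     assert emb.shape == (4, 128)  # first half sin, second half cos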
93
+
94
+ def sample_beta(alpha, beta, bsize, device): # see openpi `sample_beta` (exact copy)
95
+ alpha_t = torch.as_tensor(alpha, dtype=torch.float32, device=device)
96
+ beta_t = torch.as_tensor(beta, dtype=torch.float32, device=device)
97
+ dist = torch.distributions.Beta(alpha_t, beta_t)
98
+ return dist.sample((bsize,))
99
+
100
+
101
+ def make_att_2d_masks(pad_masks, att_masks): # see openpi `make_att_2d_masks` (exact copy)
102
+ """Copied from big_vision.
103
+
104
+ Tokens can attend to valid inputs tokens which have a cumulative mask_ar
105
+ smaller or equal to theirs. This way `mask_ar` int[B, N] can be used to
106
+ setup several types of attention, for example:
107
+
108
+ [[1 1 1 1 1 1]]: pure causal attention.
109
+
110
+ [[0 0 0 1 1 1]]: prefix-lm attention. The first 3 tokens can attend between
111
+ themselves and the last 3 tokens have a causal attention. The first
112
+ entry could also be a 1 without changing behaviour.
113
+
114
+ [[1 0 1 0 1 0 0 1 0 0]]: causal attention between 4 blocks. Tokens of a
115
+ block can attend all previous blocks and all tokens on the same block.
116
+
117
+ Args:
118
+ pad_masks: bool[B, N] true if it's part of the input, false if padding.
119
+ att_masks: int32[B, N] mask that's 1 where previous tokens cannot depend on
120
+ it and 0 where it shares the same attention mask as the previous token.
121
+ """
122
+ if att_masks.ndim != 2:
123
+ raise ValueError(att_masks.ndim)
124
+ if pad_masks.ndim != 2:
125
+ raise ValueError(pad_masks.ndim)
126
+
127
+ cumsum = torch.cumsum(att_masks, dim=1)
128
+ att_2d_masks = cumsum[:, None, :] <= cumsum[:, :, None]
129
+ pad_2d_masks = pad_masks[:, None, :] * pad_masks[:, :, None]
130
+ return att_2d_masks & pad_2d_masks
131
+
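+ # Editorial note: a small self-check of the prefix-lm pattern described in
+ # the docstring above; with att_masks = [0, 0, 0, 1, 1, 1] the first three
+ # tokens attend bidirectionally and the last three are causal.
+ def _demo_make_att_2d_masks():
+     pad_masks = torch.ones(1, 6, dtype=torch.bool)
+     att_masks = torch.tensor([[0, 0, 0, 1, 1, 1]])
+     m = make_att_2d_masks(pad_masks, att_masks)
+     assert bool(m[0, 0, 2])      # prefix token sees a later prefix token
+     assert not bool(m[0, 3, 4])  # suffix token cannot attend to the future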
132
+
133
+ def pad_vector(vector, new_dim):
134
+ """Pad the last dimension of a vector to new_dim with zeros.
135
+
136
+ Can be (batch_size x sequence_length x features_dimension)
137
+ or (batch_size x features_dimension)
138
+ """
139
+ if vector.shape[-1] >= new_dim:
140
+ return vector
141
+ return F.pad(vector, (0, new_dim - vector.shape[-1]))
142
+
143
+
144
+ def resize_with_pad_torch( # see openpi `resize_with_pad_torch` (exact copy)
145
+ images: torch.Tensor,
146
+ height: int,
147
+ width: int,
148
+ mode: str = "bilinear",
149
+ ) -> torch.Tensor:
150
+ """PyTorch version of resize_with_pad. Resizes an image to a target height and width without distortion
151
+ by padding with black. If the image is float32, it must be in the range [-1, 1].
152
+
153
+ Args:
154
+ images: Tensor of shape [*b, h, w, c] or [*b, c, h, w]
155
+ height: Target height
156
+ width: Target width
157
+ mode: Interpolation mode ('bilinear', 'nearest', etc.)
158
+
159
+ Returns:
160
+ Resized and padded tensor with same shape format as input
161
+ """
162
+ # Check if input is in channels-last format [*b, h, w, c] or channels-first [*b, c, h, w]
163
+ if images.shape[-1] <= 4: # Assume channels-last format
164
+ channels_last = True
165
+ if images.dim() == 3:
166
+ images = images.unsqueeze(0) # Add batch dimension
167
+ images = images.permute(0, 3, 1, 2) # [b, h, w, c] -> [b, c, h, w]
168
+ else:
169
+ channels_last = False
170
+ if images.dim() == 3:
171
+ images = images.unsqueeze(0) # Add batch dimension
172
+
173
+ batch_size, channels, cur_height, cur_width = images.shape
174
+
175
+ # Calculate resize ratio
176
+ ratio = max(cur_width / width, cur_height / height)
177
+ resized_height = int(cur_height / ratio)
178
+ resized_width = int(cur_width / ratio)
179
+
180
+ # Resize
181
+ resized_images = F.interpolate(
182
+ images,
183
+ size=(resized_height, resized_width),
184
+ mode=mode,
185
+ align_corners=False if mode == "bilinear" else None,
186
+ )
187
+
188
+ # Handle dtype-specific clipping
189
+ if images.dtype == torch.uint8:
190
+ resized_images = torch.round(resized_images).clamp(0, 255).to(torch.uint8)
191
+ elif images.dtype == torch.float32:
192
+ resized_images = resized_images.clamp(-1.0, 1.0)
193
+ else:
194
+ raise ValueError(f"Unsupported image dtype: {images.dtype}")
195
+
196
+ # Calculate padding
197
+ pad_h0, remainder_h = divmod(height - resized_height, 2)
198
+ pad_h1 = pad_h0 + remainder_h
199
+ pad_w0, remainder_w = divmod(width - resized_width, 2)
200
+ pad_w1 = pad_w0 + remainder_w
201
+
202
+ # Pad
203
+ constant_value = 0 if images.dtype == torch.uint8 else -1.0
204
+ padded_images = F.pad(
205
+ resized_images,
206
+ (pad_w0, pad_w1, pad_h0, pad_h1), # left, right, top, bottom
207
+ mode="constant",
208
+ value=constant_value,
209
+ )
210
+
211
+ # Convert back to original format if needed
212
+ if channels_last:
213
+ padded_images = padded_images.permute(0, 2, 3, 1) # [b, c, h, w] -> [b, h, w, c]
214
+
215
+ return padded_images
216
+
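+ # Editorial note: a usage sketch for the resize helper above, with an
+ # arbitrary wide frame. Float inputs are expected in [-1, 1]; the helper
+ # letterboxes (pads with -1) instead of stretching.
+ def _demo_resize_with_pad():
+     img = torch.zeros(1, 3, 180, 320)  # already in [-1, 1]
+     out = resize_with_pad_torch(img, 224, 224)
+     assert out.shape == (1, 3, 224, 224)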
217
+
218
+ # Define the complete layer computation function for gradient checkpointing
219
+ def compute_layer_complete(
220
+ layer_idx, inputs_embeds, attention_mask, position_ids, adarms_cond, paligemma, gemma_expert
221
+ ):
222
+ models = [paligemma.language_model, gemma_expert.model]
223
+ query_states = []
224
+ key_states = []
225
+ value_states = []
226
+ gates = []
227
+ for i, hidden_states in enumerate(inputs_embeds):
228
+ layer = models[i].layers[layer_idx]
229
+ hidden_states, gate = layer.input_layernorm(hidden_states, cond=adarms_cond[i]) # noqa: PLW2901
230
+ gates.append(gate)
231
+ input_shape = hidden_states.shape[:-1]
232
+ hidden_shape = (*input_shape, -1, layer.self_attn.head_dim)
233
+ query_state = layer.self_attn.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
234
+ key_state = layer.self_attn.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
235
+ value_state = layer.self_attn.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
236
+ query_states.append(query_state)
237
+ key_states.append(key_state)
238
+ value_states.append(value_state)
239
+ # Concatenate and process attention
240
+ query_states = torch.cat(query_states, dim=2)
241
+ key_states = torch.cat(key_states, dim=2)
242
+ value_states = torch.cat(value_states, dim=2)
243
+ dummy_tensor = torch.zeros(
244
+ query_states.shape[0],
245
+ query_states.shape[2],
246
+ query_states.shape[-1],
247
+ device=query_states.device,
248
+ dtype=query_states.dtype,
249
+ )
250
+ cos, sin = paligemma.model.language_model.rotary_emb(dummy_tensor, position_ids)
251
+ query_states, key_states = modeling_gemma.apply_rotary_pos_emb(
252
+ query_states, key_states, cos, sin, unsqueeze_dim=1
253
+ )
254
+ batch_size = query_states.shape[0]
255
+ scaling = paligemma.language_model.layers[layer_idx].self_attn.scaling
256
+ # Attention computation
257
+ att_output, _ = modeling_gemma.eager_attention_forward(
258
+ paligemma.language_model.layers[layer_idx].self_attn,
259
+ query_states,
260
+ key_states,
261
+ value_states,
262
+ attention_mask,
263
+ scaling,
264
+ )
265
+ # Get head_dim from the current layer, not from the model
266
+ head_dim = paligemma.language_model.layers[layer_idx].self_attn.head_dim
267
+ att_output = att_output.reshape(batch_size, -1, 1 * 8 * head_dim)  # 8 = num_heads (same for both gemma variants)
268
+ # Process layer outputs
269
+ outputs_embeds = []
270
+ start_pos = 0
271
+ for i, hidden_states in enumerate(inputs_embeds):
272
+ layer = models[i].layers[layer_idx]
273
+ end_pos = start_pos + hidden_states.shape[1]
274
+ if att_output.dtype != layer.self_attn.o_proj.weight.dtype:
275
+ att_output = att_output.to(layer.self_attn.o_proj.weight.dtype)
276
+ out_emb = layer.self_attn.o_proj(att_output[:, start_pos:end_pos])
277
+ # first residual
278
+ out_emb = modeling_gemma._gated_residual(hidden_states, out_emb, gates[i]) # noqa: SLF001
279
+ after_first_residual = out_emb.clone()
280
+ out_emb, gate = layer.post_attention_layernorm(out_emb, cond=adarms_cond[i])
281
+ # Convert to bfloat16 if the next layer (mlp) uses bfloat16
282
+ if layer.mlp.up_proj.weight.dtype == torch.bfloat16:
283
+ out_emb = out_emb.to(dtype=torch.bfloat16)
284
+ out_emb = layer.mlp(out_emb)
285
+ # second residual
286
+ out_emb = modeling_gemma._gated_residual(after_first_residual, out_emb, gate) # noqa: SLF001
287
+ outputs_embeds.append(out_emb)
288
+ start_pos = end_pos
289
+ return outputs_embeds
290
+
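+ # Editorial note: compute_layer_complete above runs one shared attention
+ # over two token streams (VLM prefix + action-expert suffix) by
+ # concatenating their Q/K/V along the sequence axis. The same idea in
+ # isolation, with toy shapes unrelated to the real model:
+ def _demo_joint_attention_concat():
+     b, h, n1, n2, d = 1, 2, 3, 4, 8
+     q1, k1, v1 = (torch.randn(b, h, n1, d) for _ in range(3))
+     q2, k2, v2 = (torch.randn(b, h, n2, d) for _ in range(3))
+     q = torch.cat([q1, q2], dim=2)  # (b, h, n1 + n2, d)
+     k = torch.cat([k1, k2], dim=2)
+     v = torch.cat([v1, v2], dim=2)
+     att = torch.softmax(q @ k.transpose(-1, -2) / d**0.5, dim=-1) @ v
+     out1, out2 = att[:, :, :n1], att[:, :, n1:]  # split back per stream
+     assert out1.shape == (b, h, n1, d) and out2.shape == (b, h, n2, d)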
291
+
292
+ class GemmaConfig: # see openpi `gemma.py: Config`
293
+ """Configuration for Gemma model variants."""
294
+
295
+ def __init__(self, width, depth, mlp_dim, num_heads, num_kv_heads, head_dim):
296
+ self.width = width
297
+ self.depth = depth
298
+ self.mlp_dim = mlp_dim
299
+ self.num_heads = num_heads
300
+ self.num_kv_heads = num_kv_heads
301
+ self.head_dim = head_dim
302
+
303
+
304
+ def get_gemma_config(variant: str) -> GemmaConfig: # see openpi `gemma.py: get_config`
305
+ """Returns config for specified gemma variant."""
306
+ if variant == "gemma_300m":
307
+ return GemmaConfig(
308
+ width=1024,
309
+ depth=18,
310
+ mlp_dim=4096,
311
+ num_heads=8,
312
+ num_kv_heads=1,
313
+ head_dim=256,
314
+ )
315
+ elif variant == "gemma_2b":
316
+ return GemmaConfig(
317
+ width=2048,
318
+ depth=18,
319
+ mlp_dim=16_384,
320
+ num_heads=8,
321
+ num_kv_heads=1,
322
+ head_dim=256,
323
+ )
324
+ else:
325
+ raise ValueError(f"Unknown variant: {variant}")
326
+
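+ # Editorial note: quick sanity usage of the variant table above. PI05
+ # typically pairs the 2B VLM backbone with the 300M action expert; both
+ # variants use 8 heads, which is why `8 * head_dim` appears in
+ # compute_layer_complete.
+ def _demo_gemma_configs():
+     vlm = get_gemma_config("gemma_2b")
+     expert = get_gemma_config("gemma_300m")
+     assert vlm.width == 2048 and expert.width == 1024
+     assert vlm.num_heads == expert.num_heads == 8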
327
+
328
+ class PaliGemmaWithExpertModel(
329
+ nn.Module
330
): # see openpi `gemma_pytorch.py: PaliGemmaWithExpertModel`; this class is an almost exact copy of PaliGemmaWithExpertModel in openpi
331
+ """PaliGemma model with action expert for PI05."""
332
+
333
+ def __init__(
334
+ self,
335
+ vlm_config,
336
+ action_expert_config,
337
+ use_adarms=None,
338
+ precision: Literal["bfloat16", "float32"] = "bfloat16",
339
+ ):
340
+ if use_adarms is None:
341
+ use_adarms = [False, False]
342
+ super().__init__()
343
+
344
+ vlm_config_hf = CONFIG_MAPPING["paligemma"]()
345
+ vlm_config_hf._vocab_size = 257152 # noqa: SLF001
346
+ vlm_config_hf.image_token_index = 257152
347
+ vlm_config_hf.text_config.hidden_size = vlm_config.width
348
+ vlm_config_hf.text_config.intermediate_size = vlm_config.mlp_dim
349
+ vlm_config_hf.text_config.num_attention_heads = vlm_config.num_heads
350
+ vlm_config_hf.text_config.head_dim = vlm_config.head_dim
351
+ vlm_config_hf.text_config.num_hidden_layers = vlm_config.depth
352
+ vlm_config_hf.text_config.num_key_value_heads = vlm_config.num_kv_heads
353
+ vlm_config_hf.text_config.hidden_activation = "gelu_pytorch_tanh"
354
+ vlm_config_hf.text_config.torch_dtype = "float32"
355
+ vlm_config_hf.text_config.vocab_size = 257152
356
+ vlm_config_hf.text_config.use_adarms = use_adarms[0]
357
+ vlm_config_hf.text_config.adarms_cond_dim = vlm_config.width if use_adarms[0] else None
358
+ vlm_config_hf.vision_config.intermediate_size = 4304
359
+ vlm_config_hf.vision_config.projection_dim = 2048
360
+ vlm_config_hf.vision_config.projector_hidden_act = "gelu_fast"
361
+ vlm_config_hf.vision_config.torch_dtype = "float32"
362
+
363
+ action_expert_config_hf = CONFIG_MAPPING["gemma"](
364
+ head_dim=action_expert_config.head_dim,
365
+ hidden_size=action_expert_config.width,
366
+ intermediate_size=action_expert_config.mlp_dim,
367
+ num_attention_heads=action_expert_config.num_heads,
368
+ num_hidden_layers=action_expert_config.depth,
369
+ num_key_value_heads=action_expert_config.num_kv_heads,
370
+ vocab_size=257152,
371
+ hidden_activation="gelu_pytorch_tanh",
372
+ torch_dtype="float32",
373
+ use_adarms=use_adarms[1],
374
+ adarms_cond_dim=action_expert_config.width if use_adarms[1] else None,
375
+ )
376
+
377
+ self.paligemma = PaliGemmaForConditionalGeneration(config=vlm_config_hf)
378
+ self.gemma_expert = GemmaForCausalLM(config=action_expert_config_hf)
379
+ self.gemma_expert.model.embed_tokens = None
380
+
381
+ self.to_bfloat16_for_selected_params(precision)
382
+
383
+ def to_bfloat16_for_selected_params(self, precision: Literal["bfloat16", "float32"] = "bfloat16"):
384
+ if precision == "bfloat16":
385
+ self.to(dtype=torch.bfloat16)
386
+ elif precision == "float32":
387
+ self.to(dtype=torch.float32)
388
+ return
389
+ else:
390
+ raise ValueError(f"Invalid precision: {precision}")
391
+
392
+ params_to_keep_float32 = [
393
+ "vision_tower.vision_model.embeddings.patch_embedding.weight",
394
+ "vision_tower.vision_model.embeddings.patch_embedding.bias",
395
+ "vision_tower.vision_model.embeddings.position_embedding.weight",
396
+ "input_layernorm",
397
+ "post_attention_layernorm",
398
+ "model.norm",
399
+ ]
400
+
401
+ for name, param in self.named_parameters():
402
+ if any(selector in name for selector in params_to_keep_float32):
403
+ param.data = param.data.to(dtype=torch.float32)
404
+
405
+ def embed_image(self, image: torch.Tensor):
406
+ return self.paligemma.model.get_image_features(image)
407
+
408
+ def embed_language_tokens(self, tokens: torch.Tensor):
409
+ return self.paligemma.language_model.embed_tokens(tokens)
410
+
411
+ def forward(
412
+ self,
413
+ attention_mask: torch.Tensor | None = None,
414
+ position_ids: torch.LongTensor | None = None,
415
+ past_key_values: list[torch.FloatTensor] | None = None,
416
+ inputs_embeds: list[torch.FloatTensor] | None = None,
417
+ use_cache: bool | None = None,
418
+ adarms_cond: list[torch.Tensor] | None = None,
419
+ ):
420
+ if adarms_cond is None:
421
+ adarms_cond = [None, None]
422
+ if inputs_embeds[1] is None:
423
+ prefix_output = self.paligemma.language_model.forward(
424
+ inputs_embeds=inputs_embeds[0],
425
+ attention_mask=attention_mask,
426
+ position_ids=position_ids,
427
+ past_key_values=past_key_values,
428
+ use_cache=use_cache,
429
+ adarms_cond=adarms_cond[0] if adarms_cond is not None else None,
430
+ )
431
+ prefix_past_key_values = prefix_output.past_key_values
432
+ prefix_output = prefix_output.last_hidden_state
433
+ suffix_output = None
434
+ elif inputs_embeds[0] is None:
435
+ suffix_output = self.gemma_expert.model.forward(
436
+ inputs_embeds=inputs_embeds[1],
437
+ attention_mask=attention_mask,
438
+ position_ids=position_ids,
439
+ past_key_values=past_key_values,
440
+ use_cache=use_cache,
441
+ adarms_cond=adarms_cond[1] if adarms_cond is not None else None,
442
+ )
443
+ suffix_output = suffix_output.last_hidden_state
444
+ prefix_output = None
445
+ prefix_past_key_values = None
446
+ else:
447
+ models = [self.paligemma.language_model, self.gemma_expert.model]
448
+ num_layers = self.paligemma.config.text_config.num_hidden_layers
449
+
450
+ # Check if gradient checkpointing is enabled for any of the models
451
+ use_gradient_checkpointing = (
452
+ hasattr(self.gemma_expert.model, "gradient_checkpointing")
453
+ and self.gemma_expert.model.gradient_checkpointing
454
+ and self.training
455
+ ) or (hasattr(self, "gradient_checkpointing") and self.gradient_checkpointing and self.training)
456
+
457
+ # Process all layers with gradient checkpointing if enabled
458
+ for layer_idx in range(num_layers):
459
+ if use_gradient_checkpointing:
460
+ inputs_embeds = torch.utils.checkpoint.checkpoint(
461
+ compute_layer_complete,
462
+ layer_idx,
463
+ inputs_embeds,
464
+ attention_mask,
465
+ position_ids,
466
+ adarms_cond,
467
+ use_reentrant=False,
468
+ preserve_rng_state=False,
469
+ paligemma=self.paligemma,
470
+ gemma_expert=self.gemma_expert,
471
+ )
472
+ else:
473
+ inputs_embeds = compute_layer_complete(
474
+ layer_idx,
475
+ inputs_embeds,
476
+ attention_mask,
477
+ position_ids,
478
+ adarms_cond,
479
+ paligemma=self.paligemma,
480
+ gemma_expert=self.gemma_expert,
481
+ )
482
+
483
+ # final norm
484
+ def compute_final_norms(inputs_embeds, adarms_cond):
485
+ outputs_embeds = []
486
+ for i, hidden_states in enumerate(inputs_embeds):
487
+ out_emb, _ = models[i].norm(hidden_states, cond=adarms_cond[i])
488
+ outputs_embeds.append(out_emb)
489
+ return outputs_embeds
490
+
491
+ # Apply gradient checkpointing to final norm if enabled
492
+ if use_gradient_checkpointing:
493
+ outputs_embeds = torch.utils.checkpoint.checkpoint(
494
+ compute_final_norms,
495
+ inputs_embeds,
496
+ adarms_cond,
497
+ use_reentrant=False,
498
+ preserve_rng_state=False,
499
+ )
500
+ else:
501
+ outputs_embeds = compute_final_norms(inputs_embeds, adarms_cond)
502
+
503
+ prefix_output = outputs_embeds[0]
504
+ suffix_output = outputs_embeds[1]
505
+ prefix_past_key_values = None
506
+
507
+ return [prefix_output, suffix_output], prefix_past_key_values
508
+
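+ # Editorial note (sketch, not upstream API): the forward above selects one
+ # of three paths depending on which inputs_embeds slot is filled. The same
+ # dispatch, reduced to its skeleton:
+ def _demo_forward_dispatch(inputs_embeds):
+     prefix, suffix = inputs_embeds
+     if suffix is None:
+         return "prefix only: run the VLM and cache K/V for denoising steps"
+     if prefix is None:
+         return "suffix only: run the action expert against cached prefix K/V"
+     return "both: interleave the two towers layer by layer (training path)"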
509
+
510
+ class PI05Pytorch(nn.Module): # see openpi `PI0Pytorch`
511
+ """Core PI05 PyTorch model."""
512
+
513
+ def __init__(self, config: PI05Config, rtc_processor: RTCProcessor | None = None):
514
+ super().__init__()
515
+ self.config = config
516
+ self.rtc_processor = rtc_processor
517
+
518
+ paligemma_config = get_gemma_config(config.paligemma_variant)
519
+ action_expert_config = get_gemma_config(config.action_expert_variant)
520
+
521
+ self.paligemma_with_expert = PaliGemmaWithExpertModel(
522
+ paligemma_config,
523
+ action_expert_config,
524
+ use_adarms=[False, True],
525
+ precision=config.dtype,
526
+ )
527
+
528
+ self.action_in_proj = nn.Linear(config.max_action_dim, action_expert_config.width)
529
+ self.action_out_proj = nn.Linear(action_expert_config.width, config.max_action_dim)
530
+
531
+ self.time_mlp_in = nn.Linear(action_expert_config.width, action_expert_config.width)
532
+ self.time_mlp_out = nn.Linear(action_expert_config.width, action_expert_config.width)
533
+
534
+ # Initialize gradient checkpointing flag
535
+ self.gradient_checkpointing_enabled = False
536
+
537
+ # Compile model if requested
538
+ if config.compile_model:
539
+ torch.set_float32_matmul_precision("high")
540
+ self.sample_actions = torch.compile(self.sample_actions, mode=config.compile_mode)
541
+
542
+ msg = """An incorrect transformer version is used, please create an issue on https://github.com/huggingface/lerobot/issues"""
543
+
544
+ # PATCH: make transformers version guard non-fatal and robust across versions
545
+ try:
546
+ from transformers.models.siglip import check
547
+
548
+ if hasattr(check, "check_whether_transformers_replace_is_installed_correctly"):
549
+ ok = check.check_whether_transformers_replace_is_installed_correctly()
550
+ if not ok:
551
+ logging.warning("[pi05] %s", msg)
552
+ else:
553
+ logging.warning(
554
+ "[patch_pi05] SigLIP check helper missing; skipping strict transformers version guard."
555
+ )
556
+ except Exception as e: # noqa: BLE001
557
+ logging.warning(
558
+ "[patch_pi05] Could not run transformers version guard (%s). "
559
+ "Continuing without strict transformers check. %s",
560
+ msg,
561
+ e,
562
+ )
563
+
564
+ def gradient_checkpointing_enable(self):
565
+ """Enable gradient checkpointing for memory optimization."""
566
+ self.gradient_checkpointing_enabled = True
567
+ self.paligemma_with_expert.paligemma.language_model.gradient_checkpointing = True
568
+ self.paligemma_with_expert.paligemma.vision_tower.gradient_checkpointing = True
569
+ self.paligemma_with_expert.gemma_expert.model.gradient_checkpointing = True
570
+ logging.info("Enabled gradient checkpointing for PI05Pytorch model")
571
+
572
+ def gradient_checkpointing_disable(self):
573
+ """Disable gradient checkpointing."""
574
+ self.gradient_checkpointing_enabled = False
575
+ self.paligemma_with_expert.paligemma.language_model.gradient_checkpointing = False
576
+ self.paligemma_with_expert.paligemma.vision_tower.gradient_checkpointing = False
577
+ self.paligemma_with_expert.gemma_expert.model.gradient_checkpointing = False
578
+ logging.info("Disabled gradient checkpointing for PI05Pytorch model")
579
+
580
+ def _rtc_enabled(self):
581
+ return self.config.rtc_config is not None and self.config.rtc_config.enabled
582
+
583
+ def _apply_checkpoint(self, func, *args, **kwargs):
584
+ """Helper method to apply gradient checkpointing if enabled."""
585
+ if self.gradient_checkpointing_enabled and self.training:
586
+ return torch.utils.checkpoint.checkpoint(
587
+ func, *args, use_reentrant=False, preserve_rng_state=False, **kwargs
588
+ )
589
+ return func(*args, **kwargs)
590
+
591
+ def _prepare_attention_masks_4d(self, att_2d_masks):
592
+ """Helper method to prepare 4D attention masks for transformer."""
593
+ att_2d_masks_4d = att_2d_masks[:, None, :, :]
594
+ return torch.where(att_2d_masks_4d, 0.0, OPENPI_ATTENTION_MASK_VALUE)
595
+
596
+ def sample_noise(self, shape, device):
597
+ return torch.normal(
598
+ mean=0.0,
599
+ std=1.0,
600
+ size=shape,
601
+ dtype=torch.float32,
602
+ device=device,
603
+ )
604
+
605
+ def sample_time(self, bsize, device):
606
+ time_beta = sample_beta(
607
+ self.config.time_sampling_beta_alpha, self.config.time_sampling_beta_beta, bsize, device
608
+ )
609
+ time = time_beta * self.config.time_sampling_scale + self.config.time_sampling_offset
610
+ return time.to(dtype=torch.float32, device=device)
611
+
612
+ def embed_prefix(
613
+ self, images, img_masks, tokens, masks
614
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
615
+ """Embed images with SigLIP and language tokens with embedding layer."""
616
+ embs = []
617
+ pad_masks = []
618
+ att_masks = []
619
+
620
+ # Process images
621
+ for img, img_mask in zip(images, img_masks, strict=True):
622
+
623
+ def image_embed_func(img):
624
+ return self.paligemma_with_expert.embed_image(img)
625
+
626
+ img_emb = self._apply_checkpoint(image_embed_func, img)
627
+ bsize, num_img_embs = img_emb.shape[:2]
628
+
629
+ embs.append(img_emb)
630
+ pad_masks.append(img_mask[:, None].expand(bsize, num_img_embs))
631
+ att_masks += [0] * num_img_embs
632
+
633
+ # Process language tokens
634
+ def lang_embed_func(tokens):
635
+ lang_emb = self.paligemma_with_expert.embed_language_tokens(tokens)
636
+ lang_emb_dim = lang_emb.shape[-1]
637
+ return lang_emb * math.sqrt(lang_emb_dim)
638
+
639
+ lang_emb = self._apply_checkpoint(lang_embed_func, tokens)
640
+ embs.append(lang_emb)
641
+ pad_masks.append(masks)
642
+
643
+ num_lang_embs = lang_emb.shape[1]
644
+ att_masks += [0] * num_lang_embs
645
+
646
+ embs = torch.cat(embs, dim=1)
647
+ pad_masks = torch.cat(pad_masks, dim=1)
648
+ att_masks = torch.tensor(att_masks, dtype=torch.bool, device=pad_masks.device)
649
+
650
+ bsize = pad_masks.shape[0]
651
+ att_masks = att_masks[None, :].expand(bsize, len(att_masks))
652
+
653
+ return embs, pad_masks, att_masks
654
+
655
+ def embed_suffix(self, noisy_actions, timestep):
656
+ """Embed noisy_actions, timestep to prepare for Expert Gemma processing."""
657
+ embs = []
658
+ pad_masks = []
659
+ att_masks = []
660
+
661
+ # Embed timestep using sine-cosine positional encoding
662
+ time_emb = create_sinusoidal_pos_embedding(
663
+ timestep,
664
+ self.action_in_proj.out_features,
665
+ min_period=self.config.min_period,
666
+ max_period=self.config.max_period,
667
+ device=timestep.device,
668
+ )
669
+ time_emb = time_emb.type(dtype=timestep.dtype)
670
+
671
+ # Fuse timestep + action information using an MLP
672
+ def action_proj_func(noisy_actions):
673
+ return self.action_in_proj(noisy_actions)
674
+
675
+ action_emb = self._apply_checkpoint(action_proj_func, noisy_actions)
676
+
677
+ def time_mlp_func(time_emb):
678
+ x = self.time_mlp_in(time_emb)
679
+ x = F.silu(x)
680
+ x = self.time_mlp_out(x)
681
+ return F.silu(x)
682
+
683
+ time_emb = self._apply_checkpoint(time_mlp_func, time_emb)
684
+ action_time_emb = action_emb
685
+ adarms_cond = time_emb
686
+
687
+ embs.append(action_time_emb)
688
+ bsize, action_time_dim = action_time_emb.shape[:2]
689
+ action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=timestep.device)
690
+ pad_masks.append(action_time_mask)
691
+
692
+ # Set attention masks so that image, language and state inputs do not attend to action tokens
693
+ att_masks += [1] + ([0] * (self.config.chunk_size - 1))
694
+
695
+ embs = torch.cat(embs, dim=1)
696
+ pad_masks = torch.cat(pad_masks, dim=1)
697
+ att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device)
698
+ att_masks = att_masks[None, :].expand(bsize, len(att_masks))
699
+
700
+ return embs, pad_masks, att_masks, adarms_cond
701
+
702
+ def forward(self, images, img_masks, tokens, masks, actions, noise=None, time=None) -> Tensor:
703
+ """Do a full training forward pass and compute the loss."""
704
+ if noise is None:
705
+ noise = self.sample_noise(actions.shape, actions.device)
706
+
707
+ if time is None:
708
+ time = self.sample_time(actions.shape[0], actions.device)
709
+
710
+ time_expanded = time[:, None, None]
711
+ x_t = time_expanded * noise + (1 - time_expanded) * actions
712
+ u_t = noise - actions
713
+
714
+ prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(images, img_masks, tokens, masks)
715
+ suffix_embs, suffix_pad_masks, suffix_att_masks, adarms_cond = self.embed_suffix(x_t, time)
716
+
717
+ if (
718
+ self.paligemma_with_expert.paligemma.language_model.layers[0].self_attn.q_proj.weight.dtype
719
+ == torch.bfloat16
720
+ ):
721
+ suffix_embs = suffix_embs.to(dtype=torch.bfloat16)
722
+ prefix_embs = prefix_embs.to(dtype=torch.bfloat16)
723
+
724
+ pad_masks = torch.cat([prefix_pad_masks, suffix_pad_masks], dim=1)
725
+ att_masks = torch.cat([prefix_att_masks, suffix_att_masks], dim=1)
726
+
727
+ att_2d_masks = make_att_2d_masks(pad_masks, att_masks)
728
+ position_ids = torch.cumsum(pad_masks, dim=1) - 1
729
+
730
+ att_2d_masks_4d = self._prepare_attention_masks_4d(att_2d_masks)
731
+
732
+ def forward_func(prefix_embs, suffix_embs, att_2d_masks_4d, position_ids, adarms_cond):
733
+ (_, suffix_out), _ = self.paligemma_with_expert.forward(
734
+ attention_mask=att_2d_masks_4d,
735
+ position_ids=position_ids,
736
+ past_key_values=None,
737
+ inputs_embeds=[prefix_embs, suffix_embs],
738
+ use_cache=False,
739
+ adarms_cond=[None, adarms_cond],
740
+ )
741
+ return suffix_out
742
+
743
+ suffix_out = self._apply_checkpoint(
744
+ forward_func, prefix_embs, suffix_embs, att_2d_masks_4d, position_ids, adarms_cond
745
+ )
746
+
747
+ suffix_out = suffix_out[:, -self.config.chunk_size :]
748
+ suffix_out = suffix_out.to(dtype=torch.float32)
749
+
750
+ def action_out_proj_func(suffix_out):
751
+ return self.action_out_proj(suffix_out)
752
+
753
+ v_t = self._apply_checkpoint(action_out_proj_func, suffix_out)
754
+
755
+ return F.mse_loss(u_t, v_t, reduction="none")
756
+
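+     # Editorial note: the loss above follows the flow-matching recipe:
+     # corrupt actions toward noise along a straight line and regress the
+     # constant velocity pointing from data to noise. A standalone sketch:
+     @staticmethod
+     def _demo_flow_matching_target():
+         actions = torch.zeros(2, 4, 3)       # (batch, chunk, action_dim)
+         noise = torch.randn_like(actions)
+         t = torch.rand(2)[:, None, None]     # one time per batch element
+         x_t = t * noise + (1 - t) * actions  # interpolant (t=1 is pure noise)
+         u_t = noise - actions                # regression target d x_t / d t
+         assert torch.allclose(actions + t * u_t, x_t)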
757
+ @torch.no_grad() # see openpi `sample_actions` (slightly adapted)
758
+ def sample_actions(
759
+ self,
760
+ images,
761
+ img_masks,
762
+ tokens,
763
+ masks,
764
+ noise=None,
765
+ num_steps=None,
766
+ **kwargs: Unpack[ActionSelectKwargs],
767
+ ) -> Tensor:
768
+ """Do a full inference forward and compute the action."""
769
+ if num_steps is None:
770
+ num_steps = self.config.num_inference_steps
771
+
772
+ bsize = tokens.shape[0]
773
+ device = tokens.device
774
+
775
+ if noise is None:
776
+ # Sample noise with padded dimension as expected by action_in_proj
777
+ actions_shape = (
778
+ bsize,
779
+ self.config.chunk_size,
780
+ self.config.max_action_dim,
781
+ ) # Use config max_action_dim for internal processing
782
+ noise = self.sample_noise(actions_shape, device)
783
+
784
+ prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(images, img_masks, tokens, masks)
785
+ prefix_att_2d_masks = make_att_2d_masks(prefix_pad_masks, prefix_att_masks)
786
+ prefix_position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1
787
+
788
+ prefix_att_2d_masks_4d = self._prepare_attention_masks_4d(prefix_att_2d_masks)
789
+ self.paligemma_with_expert.paligemma.language_model.config._attn_implementation = "eager" # noqa: SLF001
790
+
791
+ _, past_key_values = self.paligemma_with_expert.forward(
792
+ attention_mask=prefix_att_2d_masks_4d,
793
+ position_ids=prefix_position_ids,
794
+ past_key_values=None,
795
+ inputs_embeds=[prefix_embs, None],
796
+ use_cache=True,
797
+ )
798
+
799
+ dt = -1.0 / num_steps
800
+ dt = torch.tensor(dt, dtype=torch.float32, device=device)
801
+
802
+ x_t = noise
803
+ time = torch.tensor(1.0, dtype=torch.float32, device=device)
804
+ while time >= -dt / 2:
805
+ expanded_time = time.expand(bsize)
806
+
807
+ # Define a closure function to properly capture expanded_time
808
+ # This avoids the lambda expression (E731) and loop variable binding (B023) issues
809
+ def denoise_step_partial_call(input_x_t, current_timestep=expanded_time):
810
+ return self.denoise_step(
811
+ prefix_pad_masks=prefix_pad_masks,
812
+ past_key_values=past_key_values,
813
+ x_t=input_x_t,
814
+ timestep=current_timestep,
815
+ )
816
+
817
+ if self._rtc_enabled():
818
+ inference_delay = kwargs.get("inference_delay")
819
+ prev_chunk_left_over = kwargs.get("prev_chunk_left_over")
820
+ execution_horizon = kwargs.get("execution_horizon")
821
+
822
+ v_t = self.rtc_processor.denoise_step(
823
+ x_t=x_t,
824
+ prev_chunk_left_over=prev_chunk_left_over,
825
+ inference_delay=inference_delay,
826
+ time=time,
827
+ original_denoise_step_partial=denoise_step_partial_call,
828
+ execution_horizon=execution_horizon,
829
+ )
830
+ else:
831
+ v_t = denoise_step_partial_call(x_t)
832
+
833
+ # Euler step
834
+ x_t += dt * v_t
835
+
836
+ # Record x_t and v_t after Euler step
837
+ if self.rtc_processor is not None and self.rtc_processor.is_debug_enabled():
838
+ self.rtc_processor.track(time=time, x_t=x_t, v_t=v_t)
839
+
840
+ time += dt
841
+
842
+ return x_t
843
+
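+     # Editorial note: sample_actions above integrates the learned velocity
+     # field with fixed-step Euler from t=1 (noise) down to t=0 (actions).
+     # The same loop on a toy field v(x, t) = x, which contracts the state
+     # by a factor approaching e**-1 over the unit reverse-time interval:
+     @staticmethod
+     def _demo_euler_flow_integration(num_steps: int = 10) -> torch.Tensor:
+         x_t = torch.ones(1)
+         dt = -1.0 / num_steps
+         time = 1.0
+         while time >= -dt / 2:
+             v_t = x_t  # stand-in for the model's denoise_step(x_t, time)
+             x_t = x_t + dt * v_t
+             time += dt
+         return x_t  # (1 - 1/num_steps) ** num_steps -> e**-1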
844
+ def denoise_step(
845
+ self,
846
+ prefix_pad_masks,
847
+ past_key_values,
848
+ x_t,
849
+ timestep,
850
+ ):
851
+ """Apply one denoising step of the noise `x_t` at a given timestep."""
852
+ suffix_embs, suffix_pad_masks, suffix_att_masks, adarms_cond = self.embed_suffix(x_t, timestep)
853
+
854
+ suffix_len = suffix_pad_masks.shape[1]
855
+ batch_size = prefix_pad_masks.shape[0]
856
+ prefix_len = prefix_pad_masks.shape[1]
857
+
858
+ prefix_pad_2d_masks = prefix_pad_masks[:, None, :].expand(batch_size, suffix_len, prefix_len)
859
+ suffix_att_2d_masks = make_att_2d_masks(suffix_pad_masks, suffix_att_masks)
860
+ full_att_2d_masks = torch.cat([prefix_pad_2d_masks, suffix_att_2d_masks], dim=2)
861
+
862
+ prefix_offsets = torch.sum(prefix_pad_masks, dim=-1)[:, None]
863
+ position_ids = prefix_offsets + torch.cumsum(suffix_pad_masks, dim=1) - 1
864
+
865
+ full_att_2d_masks_4d = self._prepare_attention_masks_4d(full_att_2d_masks)
866
+ self.paligemma_with_expert.gemma_expert.model.config._attn_implementation = "eager" # noqa: SLF001
867
+
868
+ outputs_embeds, _ = self.paligemma_with_expert.forward(
869
+ attention_mask=full_att_2d_masks_4d,
870
+ position_ids=position_ids,
871
+ past_key_values=past_key_values,
872
+ inputs_embeds=[None, suffix_embs],
873
+ use_cache=False,
874
+ adarms_cond=[None, adarms_cond],
875
+ )
876
+
877
+ suffix_out = outputs_embeds[1]
878
+ suffix_out = suffix_out[:, -self.config.chunk_size :]
879
+ suffix_out = suffix_out.to(dtype=torch.float32)
880
+ return self.action_out_proj(suffix_out)
881
+
882
+
883
+ class PI05Policy(PreTrainedPolicy):
884
+ """PI05 Policy for LeRobot."""
885
+
886
+ config_class = PI05Config
887
+ name = "pi05"
888
+
889
+ def __init__(
890
+ self,
891
+ config: PI05Config,
892
+ ):
893
+ """
894
+ Args:
895
+ config: Policy configuration class instance.
896
+ """
897
+ super().__init__(config)
898
+ config.validate_features()
899
+ self.config = config
900
+
901
+ # Initialize the core PI05 model
902
+ self.init_rtc_processor()
903
+ self.model = PI05Pytorch(config, rtc_processor=self.rtc_processor)
904
+
905
+ # Enable gradient checkpointing if requested
906
+ if config.gradient_checkpointing:
907
+ self.model.gradient_checkpointing_enable()
908
+
909
+ self.model.to(config.device)
910
+
911
+ self.reset()
912
+
913
+ @classmethod
914
+ def from_pretrained(
915
+ cls: builtins.type[T],
916
+ pretrained_name_or_path: str | Path,
917
+ *,
918
+ config: PreTrainedConfig | None = None,
919
+ force_download: bool = False,
920
+ resume_download: bool | None = None,
921
+ proxies: dict | None = None,
922
+ token: str | bool | None = None,
923
+ cache_dir: str | Path | None = None,
924
+ local_files_only: bool = False,
925
+ revision: str | None = None,
926
+ strict: bool = True,
927
+ **kwargs,
928
+ ) -> T:
929
+ """Override the from_pretrained method to handle key remapping and display important disclaimer."""
930
+ print(
931
+ "The PI05 model is a direct port of the OpenPI implementation. \n"
932
+ "This implementation follows the original OpenPI structure for compatibility. \n"
933
+ "Original implementation: https://github.com/Physical-Intelligence/openpi"
934
+ )
935
+ if pretrained_name_or_path is None:
936
+ raise ValueError("pretrained_name_or_path is required")
937
+
938
+ # Use provided config if available, otherwise create default config
939
+ if config is None:
940
+ config = PreTrainedConfig.from_pretrained(
941
+ pretrained_name_or_path=pretrained_name_or_path,
942
+ force_download=force_download,
943
+ resume_download=resume_download,
944
+ proxies=proxies,
945
+ token=token,
946
+ cache_dir=cache_dir,
947
+ local_files_only=local_files_only,
948
+ revision=revision,
949
+ **kwargs,
950
+ )
951
+
952
+ # Initialize model without loading weights
953
+ # Check if dataset_stats were provided in kwargs
954
+ model = cls(config, **kwargs)
955
+
956
+ # Now manually load and remap the state dict
957
+ try:
958
+ # Try to load the pytorch_model.bin or model.safetensors file
959
+ print(f"Loading model from: {pretrained_name_or_path}")
960
+ try:
961
+ from transformers.utils import cached_file
962
+
963
+ # Try safetensors first
964
+ resolved_file = cached_file(
965
+ pretrained_name_or_path,
966
+ "model.safetensors",
967
+ cache_dir=kwargs.get("cache_dir"),
968
+ force_download=kwargs.get("force_download", False),
969
+ resume_download=kwargs.get("resume_download"),
970
+ proxies=kwargs.get("proxies"),
971
+ use_auth_token=kwargs.get("use_auth_token"),
972
+ revision=kwargs.get("revision"),
973
+ local_files_only=kwargs.get("local_files_only", False),
974
+ )
975
+ from safetensors.torch import load_file
976
+
977
+ original_state_dict = load_file(resolved_file)
978
+ print("✓ Loaded state dict from model.safetensors")
979
+ except Exception as e: # noqa: BLE001
980
+ print(f"Could not load state dict from remote files: {e}")
981
+ print("Returning model without loading pretrained weights")
982
+ return model
983
+
984
+ # First, fix any key differences # see openpi `model.py, _fix_pytorch_state_dict_keys`
985
+ fixed_state_dict = model._fix_pytorch_state_dict_keys(original_state_dict, model.config)
986
+
987
+ # Then add "model." prefix for all keys that don't already have it
988
+ remapped_state_dict = {}
989
+ remap_count = 0
990
+
991
+ for key, value in fixed_state_dict.items():
992
+ if not key.startswith("model."):
993
+ new_key = f"model.{key}"
994
+ remapped_state_dict[new_key] = value
995
+ remap_count += 1
996
+ if remap_count <= 10: # Only print first 10 to avoid spam
997
+ print(f"Remapped: {key} -> {new_key}")
998
+ else:
999
+ remapped_state_dict[key] = value
1000
+
1001
+ if remap_count > 0:
1002
+ print(f"Remapped {remap_count} state dict keys")
1003
+
1004
+ # Load the remapped state dict into the model
1005
+ missing_keys, unexpected_keys = model.load_state_dict(remapped_state_dict, strict=strict)
1006
+
1007
+ # --- PATCH: tie embed_tokens to lm_head if ckpt omitted embed_tokens ---
1008
+ if any("embed_tokens.weight" in k for k in missing_keys):
1009
+ try:
1010
+ with torch.no_grad():
1011
+ embed = model.model.paligemma_with_expert.paligemma.model.language_model.embed_tokens
1012
+ lm_head = model.model.paligemma_with_expert.paligemma.lm_head
1013
+ if embed is not None and lm_head is not None:
1014
+ embed.weight = lm_head.weight
1015
+ except Exception as _e: # noqa: BLE001
1016
+ print("[patch_pi05] Could not tie embed_tokens to lm_head:", _e)
1017
+
1018
+ # --- FIX: tie embed_tokens to lm_head if embed_tokens missing in ckpt ---
1026
+ print(f"Missing keys when loading state dict: {len(missing_keys)} keys")
1027
+ if len(missing_keys) <= 5:
1028
+ for key in missing_keys:
1029
+ print(f" - {key}")
1030
+ else:
1031
+ for key in missing_keys[:5]:
1032
+ print(f" - {key}")
1033
+ print(f" ... and {len(missing_keys) - 5} more")
1034
+
1035
+ if unexpected_keys:
1036
+ print(f"Unexpected keys when loading state dict: {len(unexpected_keys)} keys")
1037
+ if len(unexpected_keys) <= 5:
1038
+ for key in unexpected_keys:
1039
+ print(f" - {key}")
1040
+ else:
1041
+ for key in unexpected_keys[:5]:
1042
+ print(f" - {key}")
1043
+ print(f" ... and {len(unexpected_keys) - 5} more")
1044
+
1045
+ if not missing_keys and not unexpected_keys:
1046
+ print("All keys loaded successfully!")
1047
+
1048
+ except Exception as e: # noqa: BLE001
1049
+ print(f"Warning: Could not remap state dict keys: {e}")
1050
+
1051
+ return model
1052
+
1053
+ def _fix_pytorch_state_dict_keys(
1054
+ self, state_dict, model_config
1055
+ ): # see openpi `BaseModelConfig, _fix_pytorch_state_dict_keys`
1056
+ """Fix state dict keys to match current model architecture."""
1057
+ import re
1058
+
1059
+ fixed_state_dict = {}
1060
+
1061
+ for key, value in state_dict.items():
1062
+ new_key = key
1063
+
1064
+ # Handle layer norm structure changes: .weight -> .dense.weight + .dense.bias
1065
+ # For gemma expert layers
1066
+ if re.match(
1067
+ r"paligemma_with_expert\.gemma_expert\.model\.layers\.\d+\.(input_layernorm|post_attention_layernorm)\.weight",
1068
+ key,
1069
+ ):
1070
+ # Check if the model actually has adaRMS enabled for the expert
1071
+ expert_uses_adarms = getattr(
1072
+ self.model.paligemma_with_expert.gemma_expert.config, "use_adarms", False
1073
+ )
1074
+ if expert_uses_adarms:
1075
+ logging.warning(f"Skipping layer norm key (adaRMS mismatch): {key}")
1076
+ continue
1077
+
1078
+ if re.match(r"paligemma_with_expert\.gemma_expert\.model\.norm\.weight", key):
1079
+ # Check if the model actually has adaRMS enabled for the expert
1080
+ expert_uses_adarms = getattr(
1081
+ self.model.paligemma_with_expert.gemma_expert.config, "use_adarms", False
1082
+ )
1083
+ if expert_uses_adarms:
1084
+ logging.warning(f"Skipping norm key (adaRMS mismatch): {key}")
1085
+ continue
1086
+
1087
+ # Handle MLP naming changes for pi05
1088
+ # pi05 model expects time_mlp_*, but checkpoint might have action_time_mlp_*
1089
+ if key.startswith("action_time_mlp_in."):
1090
+ new_key = key.replace("action_time_mlp_in.", "time_mlp_in.")
1091
+ elif key.startswith("action_time_mlp_out."):
1092
+ new_key = key.replace("action_time_mlp_out.", "time_mlp_out.")
1093
+ # Also handle state_proj which shouldn't exist in pi05
1094
+ if key.startswith("state_proj."):
1095
+ logging.warning(f"Skipping state_proj key in pi05 mode: {key}")
1096
+ continue
1097
+
1098
+ # Handle vision tower embedding layer potential differences
1099
+ if "patch_embedding" in key:
1100
+ # Some checkpoints might have this, but current model expects different structure
1101
+ logging.warning(f"Vision embedding key might need handling: {key}")
1102
+
1103
+ fixed_state_dict[new_key] = value
1104
+
1105
+ return fixed_state_dict
1106
+
1107
+ def get_optim_params(self) -> dict:
1108
+ return self.parameters()
1109
+
1110
+ def reset(self):
1111
+ """Reset internal state - called when environment resets."""
1112
+ self._action_queue = deque(maxlen=self.config.n_action_steps)
1113
+ self._queues = {
1114
+ ACTION: deque(maxlen=self.config.n_action_steps),
1115
+ }
1116
+
1117
+ def init_rtc_processor(self):
1118
+ """Initialize RTC processor if RTC is enabled in config."""
1119
+ self.rtc_processor = None
1120
+
1121
+ # Create processor if config provided
1122
+ # If RTC is not enabled - we can still track the denoising data
1123
+ if self.config.rtc_config is not None:
1124
+ self.rtc_processor = RTCProcessor(self.config.rtc_config)
1125
+
1126
+ model_value = getattr(self, "model", None)
1127
+ if model_value is not None:
1128
+ model_value.rtc_processor = self.rtc_processor
1129
+
1130
+ def _rtc_enabled(self) -> bool:
1131
+ return self.config.rtc_config is not None and self.config.rtc_config.enabled
1132
+
1133
+ def _preprocess_images(self, batch: dict[str, Tensor]) -> tuple[list[Tensor], list[Tensor]]:
1134
+ """Preprocess images for the model.
1135
+
1136
+ Images from LeRobot are typically in [B, C, H, W] format and normalized to [0, 1].
1137
+ PaliGemma expects images in [B, C, H, W] format and normalized to [-1, 1].
1138
+ """
1139
+ images = []
1140
+ img_masks = []
1141
+
1142
+ # Get device from model parameters
1143
+ device = next(self.parameters()).device
1144
+
1145
+ present_img_keys = [key for key in self.config.image_features if key in batch]
1146
+ missing_img_keys = [key for key in self.config.image_features if key not in batch]
1147
+
1148
+ if len(present_img_keys) == 0:
1149
+ raise ValueError(
1150
+ f"All image features are missing from the batch. At least one expected. "
1151
+ f"(batch: {batch.keys()}) (image_features: {self.config.image_features})"
1152
+ )
1153
+
1154
+ # Preprocess image features present in the batch
1155
+ for key in present_img_keys:
1156
+ img = batch[key]
1157
+
1158
+ # Ensure tensor is on the same device as the model
1159
+ if img.device != device:
1160
+ img = img.to(device)
1161
+
1162
+ # Ensure float32 dtype for consistency
1163
+ if img.dtype != torch.float32:
1164
+ img = img.to(torch.float32)
1165
+
1166
+ # from openpi preprocess_observation_pytorch: Handle both [B, C, H, W] and [B, H, W, C] formats
1167
+ is_channels_first = img.shape[1] == 3 # Check if channels are in dimension 1
1168
+
1169
+ if is_channels_first:
1170
+ # Convert [B, C, H, W] to [B, H, W, C] for processing
1171
+ img = img.permute(0, 2, 3, 1)
1172
+
1173
+ # from openpi preprocess_observation_pytorch: Resize with padding if needed
1174
+ if img.shape[1:3] != self.config.image_resolution:
1175
+ img = resize_with_pad_torch(img, *self.config.image_resolution)
1176
+
1177
+ # Normalize from [0,1] to [-1,1] as expected by siglip
1178
+ img = img * 2.0 - 1.0
1179
+
1180
+ # from openpi preprocess_observation_pytorch: Convert back to [B, C, H, W] format if it was originally channels-first
1181
+ if is_channels_first:
1182
+ img = img.permute(0, 3, 1, 2) # [B, H, W, C] -> [B, C, H, W]
1183
+
1184
+ images.append(img)
1185
+ # Create mask (all ones for real images)
1186
+ bsize = img.shape[0]
1187
+ mask = torch.ones(bsize, dtype=torch.bool, device=device)
1188
+ img_masks.append(mask)
1189
+
1190
+ # Create image features not present in the batch as fully 0 padded images
1191
+ for _num_empty_cameras in range(len(missing_img_keys)):
1192
+ img = torch.ones_like(img) * -1 # Padded with -1 for SigLIP
1193
+ mask = torch.zeros_like(mask) # Mask is zero for empty cameras
1194
+ images.append(img)
1195
+ img_masks.append(mask)
1196
+
1197
+ return images, img_masks
1198
+
1199
+ def prepare_action(self, batch):
1200
+ """Pad action"""
1201
+ actions = pad_vector(batch[ACTION], self.config.max_action_dim)
1202
+ return actions
1203
+
1204
+ @torch.no_grad()
1205
+ def select_action(self, batch: dict[str, Tensor]) -> Tensor:
1206
+ """Select a single action given environment observations."""
1207
+ assert not self._rtc_enabled(), (
1208
+ "RTC is not supported for select_action, use it with predict_action_chunk"
1209
+ )
1210
+
1211
+ self.eval()
1212
+
1213
+ # Action queue logic for n_action_steps > 1
1214
+ if len(self._action_queue) == 0:
1215
+ actions = self.predict_action_chunk(batch)[:, : self.config.n_action_steps]
1216
+ # Transpose to get shape (n_action_steps, batch_size, action_dim)
1217
+ self._action_queue.extend(actions.transpose(0, 1))
1218
+
1219
+ return self._action_queue.popleft()
1220
+
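+     # Editorial note (illustrative, not used by the policy): the queue
+     # contract above means one chunk prediction serves n_action_steps
+     # environment steps before the next model call.
+     @staticmethod
+     def _demo_action_queue(n_action_steps: int = 5) -> list:
+         queue = deque(maxlen=n_action_steps)
+         taken = []
+         for _ in range(12):
+             if len(queue) == 0:
+                 queue.extend(torch.arange(n_action_steps))  # stand-in chunk
+             taken.append(int(queue.popleft()))
+         return taken  # [0..4, 0..4, 0, 1]: refilled every 5 steps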
1221
+ @torch.no_grad()
1222
+ def predict_action_chunk(self, batch: dict[str, Tensor], **kwargs: Unpack[ActionSelectKwargs]) -> Tensor:
1223
+ """Predict a chunk of actions given environment observations."""
1224
+ self.eval()
1225
+
1226
+ # Prepare inputs
1227
+ images, img_masks = self._preprocess_images(batch)
1228
+ tokens, masks = batch[OBS_LANGUAGE_TOKENS], batch[OBS_LANGUAGE_ATTENTION_MASK]
1229
+
1230
+ # Sample actions using the model (pass through RTC kwargs, no separate state needed for PI05)
1231
+ actions = self.model.sample_actions(images, img_masks, tokens, masks, **kwargs)
1232
+
1233
+ # Unpad actions to actual action dimension
1234
+ original_action_dim = self.config.output_features[ACTION].shape[0]
1235
+ actions = actions[:, :, :original_action_dim]
1236
+
1237
+ return actions
1238
+
1239
+ def forward(self, batch: dict[str, Tensor]) -> tuple[Tensor, dict]:
1240
+ """Run the batch through the model and compute the loss for training."""
1241
+
1242
+ # Prepare inputs
1243
+ images, img_masks = self._preprocess_images(batch)
1244
+ tokens, masks = batch[OBS_LANGUAGE_TOKENS], batch[OBS_LANGUAGE_ATTENTION_MASK]
1245
+
1246
+ actions = self.prepare_action(batch)
1247
+
1248
+ # Compute loss (no separate state needed for PI05)
1249
+ losses = self.model.forward(images, img_masks, tokens, masks, actions)
1250
+
1251
+ # Truncate losses to actual action dimensions
1252
+ original_action_dim = self.config.output_features[ACTION].shape[0]
1253
+ losses = losses[:, :, :original_action_dim]
1254
+
1255
+ loss = losses.mean()
1256
+
1257
+ loss_dict = {
1258
+ "loss": loss.item(),
1259
+ "loss_per_dim": losses.mean(dim=[0, 1]).detach().cpu().numpy().tolist(),
1260
+ }
1261
+
1262
+ return loss, loss_dict
1263
+
1264
+ # PATCH: downgrade transformer version guard
patch_sigma_env.py ADDED
@@ -0,0 +1,432 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ patch_sigma_env.py
4
+
5
+ Idempotent patcher for Sigma VLA experiments.
6
+
7
+ Patch goals:
8
+ 1) LeRobot PI05Policy (modeling_pi05.py):
9
+ 1.1 If ckpt omits embed_tokens.weight, tie embed_tokens.weight to lm_head.weight
10
+ *after* load_state_dict runs.
11
+ 1.2 Ensure torch is imported if target file lacks it.
12
+ 1.3 Downgrade the "incorrect transformer version" hard guard
13
+ (ValueError) to a WARNING so new GPU environments don't crash.
14
+ IMPORTANT: preserve indentation and patch only the intended guard.
15
+
16
+ 2) LeRobot policies __init__ (lerobot/policies/__init__.py):
17
+ 2.1 Make ONLY Groot/Diffusers-related imports optional (wrapped in try/except),
18
+ leaving all other exports untouched.
19
+ This prevents errors like: No module named 'triton.ops'
20
+ or diffusers/peft chain issues on fresh GPUs.
21
+
22
+ 3) eval_sigma_vla_rollout.py (your /workspace eval script):
23
+ 3.1 Force strict=False for PI05Policy.from_pretrained calls:
24
+ - strict=True -> strict=False
25
+ - if a PI05Policy load call has no strict arg, add strict=False
26
+ 3.2 Ensure randomized subset evaluation is possible:
27
+ - add --shuffle arg if missing
28
+ - change DataLoader shuffle=False -> shuffle=getattr(args,"shuffle",False)
29
+
30
+ Safe to run multiple times; no-op if already patched.
31
+ """
32
+
33
+ import os
34
+ import re
35
+ import sys
36
+ import pathlib
37
+ from typing import Optional, Tuple, List
38
+
39
+
40
+ # -------------------------
41
+ # Utilities
42
+ # -------------------------
43
+
44
+ def _read_text(p: pathlib.Path) -> str:
45
+ return p.read_text(encoding="utf-8")
46
+
47
+ def _write_text(p: pathlib.Path, s: str) -> None:
48
+ p.write_text(s, encoding="utf-8")
49
+
50
+ def _search_file(
51
+ roots: List[os.PathLike],
52
+ filename: str,
53
+ must_contain: Optional[str] = None
54
+ ) -> Optional[pathlib.Path]:
55
+ for r in roots:
56
+ r = pathlib.Path(r)
57
+ if not r.exists():
58
+ continue
59
+ for p in r.rglob(filename):
60
+ if must_contain and must_contain not in str(p):
61
+ continue
62
+ return p
63
+ return None
64
+
65
+ def _default_roots():
66
+ return [
67
+ "/workspace/lerobot/src",
68
+ "/workspace/lerobot",
69
+ pathlib.Path(sys.prefix)
70
+ / "lib"
71
+ / f"python{sys.version_info.major}.{sys.version_info.minor}"
72
+ / "site-packages",
73
+ ]
74
+
75
+
76
+ # -------------------------
77
+ # Patch 1: PI05Policy (LeRobot)
78
+ # -------------------------
79
+
80
+ def find_pi05_file() -> pathlib.Path:
81
+ env = os.getenv("PI05_FILE")
82
+ if env:
83
+ p = pathlib.Path(env)
84
+ if p.exists():
85
+ return p
86
+
87
+ p = _search_file(_default_roots(), "modeling_pi05.py", must_contain="/pi05/")
88
+ if p and p.exists():
89
+ return p
90
+
91
+ raise FileNotFoundError("modeling_pi05.py not found. Set PI05_FILE env var to its path.")
92
+
93
+
94
+ def ensure_torch_import(s: str) -> str:
95
+ if re.search(r"(?m)^\s*import\s+torch\b", s) or re.search(r"(?m)^\s*from\s+torch\b", s):
96
+ return s
97
+
98
+ lines = s.splitlines(True)
99
+ insert_idx = 0
100
+
101
+ if lines and lines[0].startswith("#!"):
102
+ insert_idx = 1
103
+
104
+ # skip module docstring block if present
105
+ if insert_idx < len(lines) and lines[insert_idx].lstrip().startswith('"""'):
106
+ i = insert_idx + 1
107
+ while i < len(lines) and '"""' not in lines[i]:
108
+ i += 1
109
+ if i < len(lines):
110
+ insert_idx = i + 1
111
+
112
+ lines.insert(insert_idx, "import torch # PATCH: required for embed/lm_head tying\n")
113
+ return "".join(lines)
114
+
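+ # Editorial note: ensure_torch_import() in action on a minimal source
+ # string (shebang + two-line docstring, no torch import yet):
+ def _demo_ensure_torch_import() -> str:
+     src = '#!/usr/bin/env python\n"""doc\n"""\nx = 1\n'
+     out = ensure_torch_import(src)
+     assert out.splitlines()[3].startswith("import torch")  # after the docstring
+     return out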
115
+
116
+ def patch_pi05_embed_tie(p: pathlib.Path) -> Tuple[bool, str]:
117
+ s = _read_text(p)
118
+ s = ensure_torch_import(s)
119
+
120
+ marker = "PATCH: tie embed_tokens to lm_head if ckpt omitted embed_tokens"
121
+ if marker in s:
122
+ _write_text(p, s)
123
+ return False, f"PI05 embed-tie patch already present: {p}"
124
+
125
+ pat = r"(?m)^(\s*)missing_keys,\s*unexpected_keys\s*=\s*model\.load_state_dict\(\s*remapped_state_dict\s*,\s*strict\s*=\s*strict\s*\)\s*$"
126
+ m = re.search(pat, s)
127
+ if not m:
128
+ _write_text(p, s)
129
+ return False, f"Could not find load_state_dict line to patch in PI05 file: {p}"
130
+
131
+ indent = m.group(1)
132
+ inject = (
133
+ f"\n{indent}# --- PATCH: tie embed_tokens to lm_head if ckpt omitted embed_tokens ---\n"
134
+ f"{indent}if any('embed_tokens.weight' in k for k in missing_keys):\n"
135
+ f"{indent} try:\n"
136
+ f"{indent} with torch.no_grad():\n"
137
+ f"{indent} embed = model.model.paligemma_with_expert.paligemma.model.language_model.embed_tokens\n"
138
+ f"{indent} lm_head = model.model.paligemma_with_expert.paligemma.lm_head\n"
139
+ f"{indent} if embed is not None and lm_head is not None:\n"
140
+ f"{indent} embed.weight = lm_head.weight # {marker}\n"
141
+ f"{indent} except Exception as _e:\n"
142
+ f"{indent} print('[patch_pi05] Could not tie embed_tokens to lm_head:', _e)\n"
143
+ )
144
+
145
+ s2 = re.sub(pat, lambda mm: mm.group(0) + inject, s, count=1)
146
+ _write_text(p, s2)
147
+ return True, f"Patched PI05 embed-tie in: {p}"
148
+
149
+
150
+ def patch_pi05_transformers_guard(p: pathlib.Path) -> Tuple[bool, str]:
151
+ """
152
+ Downgrade ONLY the PI05 hard guard:
153
+ ValueError: An incorrect transformer version is used...
154
+ to WARNING print, preserving indentation.
155
+
156
+ Strategy:
157
+ - Find raise ValueError(msg) from None lines.
158
+ - Only patch the one whose nearby context contains
159
+ "incorrect transformer version".
160
+ """
161
+ s = _read_text(p)
162
+ marker = "PATCH: downgrade transformer version guard"
163
+ if marker in s:
164
+ return False, f"PI05 transformers-guard patch already present: {p}"
165
+
166
+ if "incorrect transformer version" not in s:
167
+ return False, f"No transformers guard message found to patch in: {p}"
168
+
169
+ lines = s.splitlines(True)
170
+ raise_pat = re.compile(r"^(\s*)raise\s+ValueError\(\s*msg\s*\)\s*from\s*None\s*$")
171
+
172
+ target_idx = None
173
+ target_indent = ""
174
+
175
+ for i, line in enumerate(lines):
176
+ m = raise_pat.match(line)
177
+ if not m:
178
+ continue
179
+ # look back a few lines for the specific guard text
180
+ window_start = max(0, i - 8)
181
+ window = "".join(lines[window_start:i+1]).lower()
182
+ if "incorrect transformer version" in window:
183
+ target_idx = i
184
+ target_indent = m.group(1)
185
+ break
186
+
187
+ if target_idx is None:
188
+ return False, f"Guard raise line with context not found in: {p}"
189
+
190
+ repl = (
191
+ f"{target_indent}# --- PATCH: downgrade transformer version guard ---\n"
192
+ f"{target_indent}print('[patch_pi05] WARNING:', msg) # {marker}\n"
193
+ f"{target_indent}# continues execution despite version mismatch\n"
194
+ )
195
+
196
+ lines[target_idx] = repl
197
+ s2 = "".join(lines)
198
+ _write_text(p, s2)
199
+ return True, f"Patched PI05 transformers guard (raise->warn) in: {p}"
200
+
201
+
202
+ # -------------------------
203
+ # Patch 2: LeRobot policies optional imports
204
+ # -------------------------
205
+
206
+ def find_policies_init() -> pathlib.Path:
207
+ env = os.getenv("POLICIES_INIT_FILE")
208
+ if env:
209
+ p = pathlib.Path(env)
210
+ if p.exists():
211
+ return p
212
+
213
+ p = _search_file(_default_roots(), "__init__.py", must_contain="/lerobot/policies/")
214
+ if p and p.exists():
215
+ return p
216
+
217
+ raise FileNotFoundError("lerobot/policies/__init__.py not found. Set POLICIES_INIT_FILE env var.")
218
+
219
+
220
+ def patch_policies_optional_imports(p: pathlib.Path) -> Tuple[bool, str]:
+     """
+     Make ONLY Groot/Diffusers imports optional.
+     This avoids wrapping unrelated exports/imports.
+     """
+     s = _read_text(p)
+     marker = "PATCH: optional Groot/Diffusers imports"
+     if marker in s:
+         return False, f"Policies optional-import patch already present: {p}"
+ 
+     lines = s.splitlines(True)
+ 
+     def is_groot_line(line: str) -> bool:
+         # strict filter: only lines that import the groot submodule
+         return bool(re.search(r"^\s*from\s+\.\s*groot\b|^\s*from\s+\.groot\b|^\s*import\s+.*\bgroot\b", line))
+ 
+     idxs = [i for i, l in enumerate(lines) if is_groot_line(l)]
+     if not idxs:
+         return False, f"No Groot imports found to wrap in: {p}"
+ 
+     # group consecutive indices
+     groups = []
+     start = prev = idxs[0]
+     for i in idxs[1:]:
+         if i == prev + 1:
+             prev = i
+         else:
+             groups.append((start, prev))
+             start = prev = i
+     groups.append((start, prev))
+ 
+     new_lines = []
+     last_end = -1
+     for (a, b) in groups:
+         # copy lines before this group
+         new_lines.extend(lines[last_end + 1:a])
+ 
+         # wrap group
+         new_lines.append("# --- PATCH: optional Groot/Diffusers imports ---\n")
+         new_lines.append(f"try:  # {marker}\n")
+         for j in range(a, b + 1):
+             new_lines.append("    " + lines[j].lstrip())
+         new_lines.append("except Exception as _e:\n")
+         new_lines.append("    print('[policies_init] WARNING: optional groot deps missing:', _e)\n")
+ 
+         last_end = b
+ 
+     # copy rest
+     new_lines.extend(lines[last_end + 1:])
+ 
+     s2 = "".join(new_lines)
+     if s2 == s:
+         return False, f"Policies file unchanged after optional-import attempt: {p}"
+ 
+     _write_text(p, s2)
+     return True, f"Patched policies __init__ optional imports in: {p}"
+ 
+ 
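+ # --- Illustrative sketch (editor addition, not part of the original script): the
+ # grouping pass above turns sorted line indices into (start, end) runs so each
+ # contiguous block of groot imports is wrapped in a single try/except.
+ def _demo_group_consecutive() -> None:
+     idxs = [3, 4, 5, 9, 12, 13]
+     groups = []
+     start = prev = idxs[0]
+     for i in idxs[1:]:
+         if i == prev + 1:
+             prev = i
+         else:
+             groups.append((start, prev))
+             start = prev = i
+     groups.append((start, prev))
+     assert groups == [(3, 5), (9, 9), (12, 13)]
+ 
+ 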
+ # -------------------------
+ # Patch 3: eval_sigma_vla_rollout.py
+ # -------------------------
+ 
+ def find_eval_file() -> pathlib.Path:
+     env = os.getenv("EVAL_FILE")
+     if env:
+         p = pathlib.Path(env)
+         if p.exists():
+             return p
+ 
+     p = pathlib.Path("/workspace/eval_sigma_vla_rollout.py")
+     if p.exists():
+         return p
+ 
+     pp = _search_file(["/workspace", "/workspace/lerobot"], "eval_sigma_vla_rollout.py")
+     if pp and pp.exists():
+         return pp
+ 
+     raise FileNotFoundError("eval_sigma_vla_rollout.py not found. Set EVAL_FILE env var.")
+ 
+ 
+ def patch_eval_force_strict_false(p: pathlib.Path) -> Tuple[bool, str]:
+     s = _read_text(p)
+     marker = "PATCH: force strict=False for PI05Policy"
+ 
+     # 1) strict=True -> strict=False in PI05 loads
+     pat_strict_true = r"(policy_cls\.from_pretrained\([^)]*strict\s*=\s*)True(\s*[^)]*\))"
+     s2, n_true = re.subn(pat_strict_true, r"\1False\2", s)
+ 
+     # 2) add strict=False if missing on PI05 loads
+     def _add_strict_false_call(match: re.Match) -> str:
+         call = match.group(0)
+         if "strict" in call:
+             return call
+         return call[:-1] + ", strict=False)"
+ 
+     pat_no_strict_1 = r"policy_cls\.from_pretrained\(\s*repo_id\s*,\s*token\s*=\s*hf_token\s*\)"
+     pat_no_strict_2 = r"policy_cls\.from_pretrained\(\s*pretrained_name_or_path\s*=\s*repo_id\s*,\s*token\s*=\s*hf_token\s*\)"
+ 
+     s3, n_add1 = re.subn(pat_no_strict_1, _add_strict_false_call, s2)
+     s4, n_add2 = re.subn(pat_no_strict_2, _add_strict_false_call, s3)
+ 
+     changed = (n_true + n_add1 + n_add2) > 0
+     if not changed:
+         if marker in s:
+             return False, f"Eval strict patch already present: {p}"
+         return False, f"Eval already strict=False or no PI05 strict targets found: {p}"
+ 
+     if marker not in s4:
+         # annotate the first strict=False we introduced / touched
+         s4 = s4.replace("strict=False)", f"strict=False)  # {marker}", 1)
+ 
+     _write_text(p, s4)
+     return True, f"Patched eval PI05 strict=False in: {p}"
+ 
+ 
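+ # --- Illustrative sketch (editor addition, not part of the original script): the
+ # two-step rewrite above either flips an explicit strict=True or appends
+ # strict=False when the kwarg is absent. Toy call site shown here.
+ def _demo_strict_rewrite() -> None:
+     src = "policy = policy_cls.from_pretrained(repo_id, token=hf_token)"
+     out = re.sub(
+         r"policy_cls\.from_pretrained\(\s*repo_id\s*,\s*token\s*=\s*hf_token\s*\)",
+         lambda m: m.group(0)[:-1] + ", strict=False)",
+         src,
+     )
+     assert out.endswith("token=hf_token, strict=False)")
+ 
+ 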
+ def patch_eval_shuffle_support(p: pathlib.Path) -> Tuple[bool, str]:
+     s = _read_text(p)
+     marker_arg = "PATCH: add --shuffle arg"
+     marker_dl = "PATCH: DataLoader shuffle uses args.shuffle"
+ 
+     changed = False
+ 
+     # 1) add CLI arg --shuffle if absent
+     if re.search(r'add_argument\(\s*["\']--shuffle["\']', s) is None:
+         # find last parser.add_argument(...) to insert after
+         arg_pat = re.compile(r"(?m)^\s*parser\.add_argument\(.+?\)\s*$")
+         matches = list(arg_pat.finditer(s))
+         if matches:
+             last = matches[-1]
+             insert_pos = last.end()
+             # reuse the matched line's indentation: the parser is often built
+             # inside a function, where a column-0 insert would be wrong
+             line_indent = re.match(r"[ \t]*", last.group(0).lstrip("\r\n")).group(0)
+             insert_text = (
+                 f"\n{line_indent}parser.add_argument("
+                 "\"--shuffle\", action=\"store_true\", "
+                 "help=\"Shuffle dataset order to sample different subsets per seed.\")"
+                 f"  # {marker_arg}\n"
+             )
+             s = s[:insert_pos] + insert_text + s[insert_pos:]
+             changed = True
+ 
+     # 2) DataLoader(... shuffle=False ...) -> args.shuffle
+     if marker_dl not in s:
+         def _dl_repl(m: re.Match) -> str:
+             prefix = m.group(1)
+             # no inline "# marker" here: a comment would swallow whatever
+             # follows on the same physical line (a trailing comma or the
+             # closing paren) and break the DataLoader call
+             return prefix + 'getattr(args, "shuffle", False)'
+ 
+         # replace only a literal shuffle=False
+         pat_dl = re.compile(r"(?s)(DataLoader\([\s\S]{0,1200}?shuffle\s*=\s*)False")
+         if pat_dl.search(s):
+             s = pat_dl.sub(_dl_repl, s, count=1)
+             s += f"\n# {marker_dl}\n"  # standalone marker keeps the patch idempotent
+             changed = True
+ 
+     if changed:
+         _write_text(p, s)
+         return True, f"Patched eval shuffle support in: {p}"
+ 
+     return False, f"Eval shuffle support already present or no targets found: {p}"
+ 
+ 
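+ # --- Illustrative sketch (editor addition, not part of the original script): the
+ # bounded non-greedy window ([\s\S]{0,1200}?) above keeps the rewrite from
+ # skipping past the first DataLoader(...) call to some later shuffle=False.
+ def _demo_dataloader_shuffle_rewrite() -> None:
+     src = "loader = DataLoader(ds, batch_size=8, shuffle=False, num_workers=2)"
+     pat = re.compile(r"(?s)(DataLoader\([\s\S]{0,1200}?shuffle\s*=\s*)False")
+     out = pat.sub(lambda m: m.group(1) + 'getattr(args, "shuffle", False)', src, count=1)
+     assert 'shuffle=getattr(args, "shuffle", False), num_workers=2' in out
+ 
+ 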
+ # -------------------------
+ # Main
+ # -------------------------
+ 
+ def main():
+     changed_any = False
+ 
+     try:
+         pi05_file = find_pi05_file()
+         changed, msg = patch_pi05_embed_tie(pi05_file)
+         print(msg)
+         changed_any |= changed
+     except Exception as e:
+         print("[patch_sigma_env] PI05 embed-tie patch skipped:", e)
+ 
+     try:
+         pi05_file = find_pi05_file()
+         changed, msg = patch_pi05_transformers_guard(pi05_file)
+         print(msg)
+         changed_any |= changed
+     except Exception as e:
+         print("[patch_sigma_env] PI05 transformers-guard patch skipped:", e)
+ 
+     try:
+         policies_init = find_policies_init()
+         changed, msg = patch_policies_optional_imports(policies_init)
+         print(msg)
+         changed_any |= changed
+     except Exception as e:
+         print("[patch_sigma_env] policies __init__ patch skipped:", e)
+ 
+     try:
+         eval_file = find_eval_file()
+         changed, msg = patch_eval_force_strict_false(eval_file)
+         print(msg)
+         changed_any |= changed
+     except Exception as e:
+         print("[patch_sigma_env] Eval strict patch skipped:", e)
+ 
+     try:
+         eval_file = find_eval_file()
+         changed, msg = patch_eval_shuffle_support(eval_file)
+         print(msg)
+         changed_any |= changed
+     except Exception as e:
+         print("[patch_sigma_env] Eval shuffle patch skipped:", e)
+ 
+     if changed_any:
+         print("[patch_sigma_env] Done. Patches applied.")
+     else:
+         print("[patch_sigma_env] Done. Nothing to change (already patched).")
+ 
+ 
+ if __name__ == "__main__":
+     main()
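+ 
+ # Usage sketch (editor addition; script name inferred from the log prefix above,
+ # env vars are the optional path overrides defined in the find_* helpers):
+ #   POLICIES_INIT_FILE=/path/to/lerobot/policies/__init__.py \
+ #   EVAL_FILE=/workspace/eval_sigma_vla_rollout.py \
+ #   python patch_sigma_env.py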
pi05_embed_tie.patch ADDED
@@ -0,0 +1,19 @@
+ diff --git a/src/lerobot/policies/pi05/modeling_pi05.py b/src/lerobot/policies/pi05/modeling_pi05.py
+ index b017bbc5..d6290da6 100644
+ --- a/src/lerobot/policies/pi05/modeling_pi05.py
+ +++ b/src/lerobot/policies/pi05/modeling_pi05.py
+ @@ -989,6 +989,13 @@ class PI05Policy(PreTrainedPolicy):
+          if remap_count > 0:
+              print(f"Remapped {remap_count} state dict keys")
+  
+          # Load the remapped state dict into the model
+          missing_keys, unexpected_keys = model.load_state_dict(remapped_state_dict, strict=strict)
+ +
+ +        # --- FIX: tie embed_tokens to lm_head if embed_tokens missing in ckpt ---
+ +        if any("embed_tokens.weight" in k for k in missing_keys):
+ +            with torch.no_grad():
+ +                embed = model.model.paligemma_with_expert.paligemma.model.language_model.embed_tokens
+ +                lm_head = model.model.paligemma_with_expert.paligemma.lm_head
+ +                embed.weight = lm_head.weight
+  
+          return model
requirements.txt ADDED
@@ -0,0 +1,33 @@
+ --index-url https://download.pytorch.org/whl/cu121
+ --extra-index-url https://pypi.org/simple
+ 
+ torch==2.5.1+cu121
+ torchvision==0.20.1+cu121
+ 
+ transformers==4.44.2
+ accelerate==1.10.1
+ peft==0.17.0
+ safetensors==0.4.5
+ huggingface_hub[cli,hf-transfer]==0.35.3
+ datasets==4.1.1
+ sentencepiece==0.2.0
+ einops==0.8.0
+ bitsandbytes==0.43.3
+ 
+ numpy==2.0.2
+ pandas==2.2.3
+ pyarrow==21.0.0
+ tqdm==4.66.5
+ 
+ opencv-python-headless==4.10.0.84
+ pillow==10.4.0
+ av==15.1.0
+ imageio==2.36.0
+ imageio-ffmpeg==0.5.1
+ 
+ hydra-core==1.3.2
+ omegaconf==2.3.0
+ pyyaml==6.0.2
+ packaging==24.2
+ python-dotenv==1.0.1
+ wandb==0.21.1