Spaces:

nvidia
/

Cosmos3-Action-Viewer

Running

App Files Files Community

XinKongCosmos committed 1 day ago

Commit

80ca707

verified ·

1 Parent(s): e861d93

Clean raw-action viewer release

Browse files

Files changed (24) hide show

cosmos-framework/cosmos_framework/data/vfm/action/action_normalization.py +0 -49
cosmos-framework/cosmos_framework/data/vfm/action/bridge_orig_lerobot_dataset.py +0 -4
cosmos-framework/cosmos_framework/data/vfm/action/cosmos3_action_lerobot.py +4 -97
cosmos-framework/cosmos_framework/data/vfm/action/domain_utils.py +0 -3
cosmos-framework/cosmos_framework/data/vfm/action/droid_lerobot_dataset.py +2 -5
cosmos-framework/cosmos_framework/data/vfm/action/fractal.py +0 -4
cosmos-framework/cosmos_framework/data/vfm/action/libero_dataset.py +0 -611
cosmos-framework/cosmos_framework/data/vfm/action/libero_pose_utils.py +0 -69
cosmos-framework/cosmos_framework/data/vfm/action/normalizers/bridge_orig_lerobot_backward_framewise_rot6d.json +0 -33
cosmos-framework/cosmos_framework/data/vfm/action/normalizers/droid_lerobot_backward_framewise_rot6d.json +0 -33
cosmos-framework/cosmos_framework/data/vfm/action/normalizers/fractal_backward_framewise_rot6d.json +0 -33
cosmos-framework/cosmos_framework/data/vfm/action/normalizers/libero_native_frame_wise_relative_rot6d.json +0 -37
cosmos-framework/cosmos_framework/data/vfm/action/normalizers/robomind-franka-dual_backward_framewise_rot6d.json +0 -33
cosmos-framework/cosmos_framework/data/vfm/action/normalizers/robomind-franka_backward_framewise_rot6d.json +0 -33
cosmos-framework/cosmos_framework/data/vfm/action/robomind_franka_dataset.py +0 -3
cosmos-framework/cosmos_framework/data/vfm/action/umi_lerobot_dataset.py +1 -3
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/README.md +12 -28
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/action_datasets.py +0 -6
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/ik_solver.py +2 -2
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/unified_action.py +68 -326
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/unified_renderer.py +6 -96
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/ur5e_robotiq_2f85.xml +0 -326
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/urdf_loader.py +0 -139
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py +7 -41

cosmos-framework/cosmos_framework/data/vfm/action/action_normalization.py DELETED Viewed

@@ -1,49 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-"""Action normalization helpers."""
-import json
-from pathlib import Path
-import numpy as np
-import torch
-from cosmos_framework.utils import log
-def load_action_stats(stats_path: str, stats_key: str = "global") -> dict[str, np.ndarray]:
-    """Load pre-computed action normalization stats from a JSON file."""
-    path = Path(stats_path)
-    if not path.exists():
-        raise FileNotFoundError(f"Action normalization stats not found at {stats_path}.")
-    log.info(f"Loading action normalization stats from {stats_path}")
-    with path.open("r") as f:
-        raw = json.load(f)
-    if stats_key in raw:
-        raw = raw[stats_key]
-        if not isinstance(raw, dict):
-            raise TypeError(f"Action normalization stats block {stats_key!r} in {stats_path} must be a dict.")
-    elif stats_key != "global":
-        raise KeyError(f"Action normalization stats block {stats_key!r} not found in {stats_path}.")
-    stat_keys = {"mean", "std", "min", "max", "q01", "q99"}
-    return {k: np.array(v, dtype=np.float32) for k, v in raw.items() if k in stat_keys}
-def normalize_action(
-    action: torch.Tensor,
-    method: str,
-    stats: dict[str, torch.Tensor],
-) -> torch.Tensor:
-    """Normalize action tensor (all dimensions including gripper)."""
-    if method == "quantile":
-        q01, q99 = stats["q01"], stats["q99"]
-        denom = (q99 - q01).clamp(min=1e-8)
-        return (2.0 * (action - q01) / denom - 1.0).clamp(-1.0, 1.0)
-    if method == "meanstd":
-        return (action - stats["mean"]) / stats["std"].clamp(min=1e-8)
-    if method == "minmax":
-        lo, hi = stats["min"], stats["max"]
-        denom = (hi - lo).clamp(min=1e-8)
-        return (2.0 * (action - lo) / denom - 1.0).clamp(-1.0, 1.0)
-    raise ValueError(f"Unknown normalization method: {method!r}")

cosmos-framework/cosmos_framework/data/vfm/action/bridge_orig_lerobot_dataset.py CHANGED Viewed

@@ -18,7 +18,6 @@ from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata
 from cosmos_framework.utils import log
 from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
-    ActionNormalization,
     ActionSpec,
     BaseActionLeRobotDataset,
     Gripper,
@@ -101,7 +100,6 @@ class BridgeOrigLeRobotDataset(BaseActionLeRobotDataset):
         split: str = "train",
         mode: str = "policy",
         pose_convention: PoseConvention = "backward_framewise",
-        action_normalization: ActionNormalization | None = None,
         viewpoint: Viewpoint = "ego_view",
         enable_fast_init: bool = False,
     ) -> None:
@@ -117,7 +115,6 @@ class BridgeOrigLeRobotDataset(BaseActionLeRobotDataset):
             viewpoint=viewpoint,
             pose_convention=pose_convention,
             rotation_format="rot6d",
-            action_normalization=action_normalization,
             tolerance_s=1e-4,
             enable_fast_init=enable_fast_init,
         )
@@ -208,7 +205,6 @@ class BridgeOrigLeRobotDataset(BaseActionLeRobotDataset):
     # ------------------------------------------------------------------
     # Normalization is handled by BaseActionLeRobotDataset.
     # Stats are loaded from:
-    #   cosmos_framework/data/vfm/action/normalizers/
     #       bridge_orig_lerobot_<pose_convention>_<rotation_format>.json
     # Regenerate via ``compute_action_stats.py`` + ``debug/stats_all.sh``.
     # ------------------------------------------------------------------

 from cosmos_framework.utils import log
 from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
     ActionSpec,
     BaseActionLeRobotDataset,
     Gripper,
         split: str = "train",
         mode: str = "policy",
         pose_convention: PoseConvention = "backward_framewise",
         viewpoint: Viewpoint = "ego_view",
         enable_fast_init: bool = False,
     ) -> None:
             viewpoint=viewpoint,
             pose_convention=pose_convention,
             rotation_format="rot6d",
             tolerance_s=1e-4,
             enable_fast_init=enable_fast_init,
         )
     # ------------------------------------------------------------------
     # Normalization is handled by BaseActionLeRobotDataset.
     # Stats are loaded from:
     #       bridge_orig_lerobot_<pose_convention>_<rotation_format>.json
     # Regenerate via ``compute_action_stats.py`` + ``debug/stats_all.sh``.
     # ------------------------------------------------------------------

cosmos-framework/cosmos_framework/data/vfm/action/cosmos3_action_lerobot.py CHANGED Viewed

@@ -21,9 +21,8 @@ from bisect import bisect_right
 from collections import OrderedDict, defaultdict
 from collections.abc import Callable, Sequence
 from concurrent.futures import ThreadPoolExecutor
-from pathlib import Path
 from threading import Lock
-from typing import Any, ClassVar, Literal
 import huggingface_hub.constants as _hf_const
 import numpy as np
@@ -57,11 +56,6 @@ def _ensure_hf_hub_offline() -> None:
 from functools import cached_property
 from cosmos_framework.utils import log
-from cosmos_framework.data.vfm.action.action_normalization import (
-    load_action_stats,
-    normalize_action,
-)
 # Re-export the action_spec DSL from this module so that subclass datasets
 # only need a single import block (alongside ``BaseActionLeRobotDataset``).
 from cosmos_framework.data.vfm.action.action_spec import (  # noqa: F401  (re-export)
@@ -96,8 +90,6 @@ from cosmos_framework.data.vfm.action_scripts.memprofile import (
 # ---------------------------------------------------------------------------
 _LRU_VIDEO_CACHE_MAX_SIZE: int = 64
 _LRU_DATASET_MAX_LOADED: int = 32
-ActionNormalization = Literal["quantile", "quantile_rot", "meanstd", "minmax"]
-_ACTION_NORMALIZATION_CHOICES: tuple[str, ...] = ("quantile", "quantile_rot", "meanstd", "minmax")
 _decoder_cache_patched = False
@@ -290,14 +282,6 @@ class BaseActionLeRobotDataset(Dataset):
     # Applied as: R_opencv = R_native @ _to_opencv
     # Subclasses override in __init__; default is identity (no correction).
-    # Bundled normalization stats directory.  Stats are committed at
-    # ``<_NORMALIZERS_DIR>/<embodiment>_<pose>_<rotation_format>.json`` (flat
-    # layout matching the existing UMI files) and produced by
-    # ``projects/cosmos3/vfm/datasets/action/compute_action_stats.py``.
-    # Subclasses that need a different filename scheme can override
-    # :meth:`_normalizer_filename`.
-    _NORMALIZERS_DIR: ClassVar[Path] = Path(__file__).parent / "normalizers"
     def __init__(
         self,
         *,
@@ -311,7 +295,6 @@ class BaseActionLeRobotDataset(Dataset):
         viewpoint: Viewpoint,
         pose_convention: str | None = None,
         rotation_format: str | None = None,
-        action_normalization: ActionNormalization | None = None,
         tolerance_s: float = 1e-4,
         max_loaded_datasets: int = _LRU_DATASET_MAX_LOADED,
         skip_video_loading: bool = False,
@@ -326,10 +309,6 @@ class BaseActionLeRobotDataset(Dataset):
         assert sample_stride >= 1, f"sample_stride must be >= 1, got {sample_stride}"
         assert fast_init_max_workers >= 1, f"fast_init_max_workers must be >= 1, got {fast_init_max_workers}"
-        assert action_normalization is None or action_normalization in _ACTION_NORMALIZATION_CHOICES, (
-            f"action_normalization must be None or one of {_ACTION_NORMALIZATION_CHOICES}, got {action_normalization!r}"
-        )
         with rss_tracker(f"{self.__class__.__name__}.__init__", enabled=self._memprofile):
             self._fps = fps
             self._dt = 1.0 / fps
@@ -342,10 +321,6 @@ class BaseActionLeRobotDataset(Dataset):
             self._viewpoint: Viewpoint = viewpoint
             self._pose_convention = pose_convention
             self._rotation_format = rotation_format
-            self._action_normalization = action_normalization
-            # Lazy-loaded stats cache, populated on first call to
-            # :meth:`_normalize_action`.  Per-process (workers get their own).
-            self._norm_stats: dict[str, torch.Tensor] | None = None
             self._tolerance_s = tolerance_s
             self._max_loaded_datasets = max_loaded_datasets
             self._skip_video_loading = skip_video_loading
@@ -728,74 +703,6 @@ class BaseActionLeRobotDataset(Dataset):
         return mode, dataset_idx, row_idx, sample
-    # -- action normalization ------------------------------------------------
-    def _normalizer_filename(self) -> str:
-        """Bundled stats filename for this dataset instance.
-        Default convention (matches ``compute_action_stats.py`` output):
-        ``<embodiment_type>[_<pose_convention>][_<rotation_format>].json``.
-        Pose/rotation suffixes are appended only when the instance actually
-        has them (SE(3) pose datasets like Bridge / DROID).  Joint-space
-        datasets — where both are ``None`` — resolve to just
-        ``<embodiment_type>.json``.
-        Subclasses may override when the bundled filename uses a different
-        scheme (e.g. UMI's ``uva_umi_single_task_normalizer.json``).
-        """
-        if not self._embodiment_type:
-            raise RuntimeError(
-                f"{self.__class__.__name__}: embodiment_type is not set; cannot resolve normalizer filename."
-            )
-        parts = [self._embodiment_type]
-        if self._pose_convention:
-            parts.append(self._pose_convention)
-        if self._rotation_format:
-            parts.append(self._rotation_format)
-        return "_".join(parts) + ".json"
-    def _normalizer_path(self) -> Path:
-        """Full path to the bundled stats JSON for this dataset."""
-        return self._NORMALIZERS_DIR / self._normalizer_filename()
-    def _load_norm_stats(self) -> dict[str, torch.Tensor]:
-        """Lazy-load action normalization stats (once per worker process).
-        Raises :class:`FileNotFoundError` if the stats file is missing.  This
-        is intentional — silently falling back to identity normalization when
-        the user asked for ``quantile`` / ``quantile_rot`` / ``meanstd`` /
-        ``minmax`` would be a training bug.
-        """
-        if self._norm_stats is not None:
-            return self._norm_stats
-        stats_key = "global_raw" if self._action_normalization == "quantile_rot" else "global"
-        raw = load_action_stats(str(self._normalizer_path()), stats_key=stats_key)
-        self._norm_stats = {}
-        for key, value in raw.items():
-            self._norm_stats[key] = torch.from_numpy(value).float()  # [D]
-        return self._norm_stats
-    def _normalize_action(self, action: torch.Tensor) -> torch.Tensor:
-        """Apply the configured normalization, or return the raw action.
-        - ``action_normalization=None`` → pass-through (used by viewer / debug)
-        - ``"quantile"``   → ``2·(x − q01) / (q99 − q01) − 1`` clamped to [-1, 1]
-        - ``"quantile_rot"`` → same as ``"quantile"``, but using ``global_raw``
-          stats so rotation dimensions are normalized too.
-        - ``"meanstd"``    → ``(x − mean) / std``
-        - ``"minmax"``     → ``2·(x − min) / (max − min) − 1`` clamped to [-1, 1]
-        """
-        if self._action_normalization is None:
-            return action
-        method = "quantile" if self._action_normalization == "quantile_rot" else self._action_normalization
-        normalized_action = normalize_action(
-            action,
-            method,
-            self._load_norm_stats(),
-        )  # [T,D]
-        return normalized_action
     # -- video formatting ----------------------------------------------------
     def _convert_video(self, video_tchw: torch.Tensor | None) -> torch.Tensor | None:
@@ -989,9 +896,9 @@ class BaseActionLeRobotDataset(Dataset):
             if idle_frames is not None:
                 extras = {"idle_frames": idle_frames, **extras}
-        normalized_action = self._normalize_action(action)  # [T,D]
         if self._skip_video_loading:
-            result: dict[str, Any] = {"action": normalized_action}
             if "idle_frames" in extras:
                 result["idle_frames"] = extras["idle_frames"]
             return result
@@ -999,7 +906,7 @@ class BaseActionLeRobotDataset(Dataset):
         return {
             "ai_caption": ai_caption,
             "video": formatted_video,
-            "action": normalized_action,
             "conditioning_fps": torch.tensor(self._fps, dtype=torch.long),
             "mode": mode,
             "domain_id": torch.tensor(self._domain_id, dtype=torch.long),

 from collections import OrderedDict, defaultdict
 from collections.abc import Callable, Sequence
 from concurrent.futures import ThreadPoolExecutor
 from threading import Lock
+from typing import Any, Literal
 import huggingface_hub.constants as _hf_const
 import numpy as np
 from functools import cached_property
 from cosmos_framework.utils import log
 # Re-export the action_spec DSL from this module so that subclass datasets
 # only need a single import block (alongside ``BaseActionLeRobotDataset``).
 from cosmos_framework.data.vfm.action.action_spec import (  # noqa: F401  (re-export)
 # ---------------------------------------------------------------------------
 _LRU_VIDEO_CACHE_MAX_SIZE: int = 64
 _LRU_DATASET_MAX_LOADED: int = 32
 _decoder_cache_patched = False
     # Applied as: R_opencv = R_native @ _to_opencv
     # Subclasses override in __init__; default is identity (no correction).
     def __init__(
         self,
         *,
         viewpoint: Viewpoint,
         pose_convention: str | None = None,
         rotation_format: str | None = None,
         tolerance_s: float = 1e-4,
         max_loaded_datasets: int = _LRU_DATASET_MAX_LOADED,
         skip_video_loading: bool = False,
         assert sample_stride >= 1, f"sample_stride must be >= 1, got {sample_stride}"
         assert fast_init_max_workers >= 1, f"fast_init_max_workers must be >= 1, got {fast_init_max_workers}"
         with rss_tracker(f"{self.__class__.__name__}.__init__", enabled=self._memprofile):
             self._fps = fps
             self._dt = 1.0 / fps
             self._viewpoint: Viewpoint = viewpoint
             self._pose_convention = pose_convention
             self._rotation_format = rotation_format
             self._tolerance_s = tolerance_s
             self._max_loaded_datasets = max_loaded_datasets
             self._skip_video_loading = skip_video_loading
         return mode, dataset_idx, row_idx, sample
     # -- video formatting ----------------------------------------------------
     def _convert_video(self, video_tchw: torch.Tensor | None) -> torch.Tensor | None:
             if idle_frames is not None:
                 extras = {"idle_frames": idle_frames, **extras}
+        raw_action = action  # [T,D]
         if self._skip_video_loading:
+            result: dict[str, Any] = {"action": raw_action}
             if "idle_frames" in extras:
                 result["idle_frames"] = extras["idle_frames"]
             return result
         return {
             "ai_caption": ai_caption,
             "video": formatted_video,
+            "action": raw_action,
             "conditioning_fps": torch.tensor(self._fps, dtype=torch.long),
             "mode": mode,
             "domain_id": torch.tensor(self._domain_id, dtype=torch.long),

cosmos-framework/cosmos_framework/data/vfm/action/domain_utils.py CHANGED Viewed

@@ -7,16 +7,13 @@ EMBODIMENT_TO_DOMAIN_ID: dict[str, int] = {
     "no_action": 0,
     "av": 1,
     "camera_pose": 2,
-    "hand_pose": 3,
     "pusht": 4,
-    "libero": 5,
     "umi": 6,
     "bridge_orig_lerobot": 7,
     "droid_lerobot": 8,
     "robomind-franka": 8,  # Both Droid and RoboMIND-Franka are using robotiq and franka
     "embodiment_b": 9,
     "robomind-franka-dual": 12,
-    "robomind-ur": 13,
     "fractal": 20,
 }

     "no_action": 0,
     "av": 1,
     "camera_pose": 2,
     "pusht": 4,
     "umi": 6,
     "bridge_orig_lerobot": 7,
     "droid_lerobot": 8,
     "robomind-franka": 8,  # Both Droid and RoboMIND-Franka are using robotiq and franka
     "embodiment_b": 9,
     "robomind-franka-dual": 12,
     "fractal": 20,
 }

cosmos-framework/cosmos_framework/data/vfm/action/droid_lerobot_dataset.py CHANGED Viewed

@@ -11,7 +11,6 @@ from scipy.spatial.transform import Rotation as R
 from cosmos_framework.utils import log
 from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
-    ActionNormalization,
     ActionSpec,
     BaseActionLeRobotDataset,
     Gripper,
@@ -69,7 +68,6 @@ class DROIDLeRobotDataset(BaseActionLeRobotDataset):
         split: str = "train",
         mode: str = "policy",
         pose_convention: PoseConvention = "backward_framewise",
-        action_normalization: ActionNormalization | None = None,
         tolerance_s=2e-4,
         viewpoint: Viewpoint = "concat_view",
         use_success_only: bool = False,
@@ -93,7 +91,6 @@ class DROIDLeRobotDataset(BaseActionLeRobotDataset):
             viewpoint=viewpoint,
             pose_convention=pose_convention,
             rotation_format="rot6d",
-            action_normalization=action_normalization,
             tolerance_s=tolerance_s,
             enable_fast_init=enable_fast_init,
         )
@@ -398,7 +395,7 @@ class DROIDLeRobotDataset(BaseActionLeRobotDataset):
                             axis=-1,
                         )
                     ).float()
-                    extras["history_action"] = self._normalize_action(hist_action_raw)
             if self._use_state:
                 initial_gripper = sample[_GRIPPER_STATE_FEATURE][0].unsqueeze(-1)
                 if self._is_gripper_action_flipped:
@@ -449,7 +446,7 @@ class DROIDLeRobotDataset(BaseActionLeRobotDataset):
                     if self._is_gripper_action_flipped:
                         hist_gripper = 1.0 - hist_gripper
                     hist_action_raw = torch.cat((hist_joint, hist_gripper), dim=-1).float()
-                    extras["history_action"] = self._normalize_action(hist_action_raw)
             if self._use_state:
                 initial_gripper = sample[_GRIPPER_STATE_FEATURE][-self._chunk_length - 1].unsqueeze(-1)
                 if self._is_gripper_action_flipped:

 from cosmos_framework.utils import log
 from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
     ActionSpec,
     BaseActionLeRobotDataset,
     Gripper,
         split: str = "train",
         mode: str = "policy",
         pose_convention: PoseConvention = "backward_framewise",
         tolerance_s=2e-4,
         viewpoint: Viewpoint = "concat_view",
         use_success_only: bool = False,
             viewpoint=viewpoint,
             pose_convention=pose_convention,
             rotation_format="rot6d",
             tolerance_s=tolerance_s,
             enable_fast_init=enable_fast_init,
         )
                             axis=-1,
                         )
                     ).float()
+                    extras["history_action"] = hist_action_raw
             if self._use_state:
                 initial_gripper = sample[_GRIPPER_STATE_FEATURE][0].unsqueeze(-1)
                 if self._is_gripper_action_flipped:
                     if self._is_gripper_action_flipped:
                         hist_gripper = 1.0 - hist_gripper
                     hist_action_raw = torch.cat((hist_joint, hist_gripper), dim=-1).float()
+                    extras["history_action"] = hist_action_raw
             if self._use_state:
                 initial_gripper = sample[_GRIPPER_STATE_FEATURE][-self._chunk_length - 1].unsqueeze(-1)
                 if self._is_gripper_action_flipped:

cosmos-framework/cosmos_framework/data/vfm/action/fractal.py CHANGED Viewed

@@ -15,7 +15,6 @@ from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata
 from cosmos_framework.utils import log
 from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
-    ActionNormalization,
     ActionSpec,
     BaseActionLeRobotDataset,
     Gripper,
@@ -84,7 +83,6 @@ class FractalLeRobotDataset(BaseActionLeRobotDataset):
         split: str = "train",
         mode: str = "policy",
         pose_convention: PoseConvention = "backward_framewise",
-        action_normalization: ActionNormalization | None = None,
         viewpoint: Viewpoint = "ego_view",
         enable_fast_init: bool = False,
     ) -> None:
@@ -103,7 +101,6 @@ class FractalLeRobotDataset(BaseActionLeRobotDataset):
                 actions. Supports ``"backward_framewise"`` and
                 ``"backward_anchored"``. Set to ``None`` to disable action
                 construction outside image-to-video mode.
-            action_normalization: Optional bundled-stats normalization
                 (``"quantile"`` / ``"quantile_rot"`` / ``"meanstd"`` / ``"minmax"``);
                 ``None`` returns raw actions.
             viewpoint: Camera viewpoint type for this dataset.
@@ -119,7 +116,6 @@ class FractalLeRobotDataset(BaseActionLeRobotDataset):
             viewpoint=viewpoint,
             pose_convention=pose_convention,
             rotation_format="rot6d",
-            action_normalization=action_normalization,
             tolerance_s=1e-4,
             enable_fast_init=enable_fast_init,
         )

 from cosmos_framework.utils import log
 from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
     ActionSpec,
     BaseActionLeRobotDataset,
     Gripper,
         split: str = "train",
         mode: str = "policy",
         pose_convention: PoseConvention = "backward_framewise",
         viewpoint: Viewpoint = "ego_view",
         enable_fast_init: bool = False,
     ) -> None:
                 actions. Supports ``"backward_framewise"`` and
                 ``"backward_anchored"``. Set to ``None`` to disable action
                 construction outside image-to-video mode.
                 (``"quantile"`` / ``"quantile_rot"`` / ``"meanstd"`` / ``"minmax"``);
                 ``None`` returns raw actions.
             viewpoint: Camera viewpoint type for this dataset.
             viewpoint=viewpoint,
             pose_convention=pose_convention,
             rotation_format="rot6d",
             tolerance_s=1e-4,
             enable_fast_init=enable_fast_init,
         )

cosmos-framework/cosmos_framework/data/vfm/action/libero_dataset.py DELETED Viewed

@@ -1,611 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-"""LIBERO dataset for training from local storage, supporting multiple dataset roots."""
-import random
-from pathlib import Path
-from typing import Literal
-import torch
-import torchvision.transforms.functional as F
-from lerobot.datasets.lerobot_dataset import LeRobotDataset
-from torch.utils.data import Dataset
-from cosmos_framework.utils import log
-from cosmos_framework.data.vfm.action.action_normalization import (
-    load_action_stats,
-    normalize_action,
-)
-from cosmos_framework.data.vfm.action.action_spec import (
-    Gripper,
-    Pos,
-    Rot,
-    build_action_spec,
-)
-from cosmos_framework.data.vfm.action.domain_utils import get_domain_id
-from cosmos_framework.data.vfm.action.libero_pose_utils import (
-    libero_action_dim,
-    libero_rotation_format,
-)
-from cosmos_framework.data.vfm.action.pose_utils import (
-    compute_idle_frames,
-    convert_rotation,
-)
-LIBERO_ROOTS: list[str] = [
-    "<PATH_TO_LIBERO_10>",
-    "<PATH_TO_LIBERO_90>",
-    "<PATH_TO_LIBERO_OBJECT>",
-    "<PATH_TO_LIBERO_SPATIAL>",
-    "<PATH_TO_LIBERO_GOAL>",
-]
-class LIBERODataset(Dataset):
-    """
-    A Dataset wrapper for LeRobot LIBERO dataset(s) designed for training from local storage.
-    This dataset:
-    - Loads data from local storage using LeRobotDataset
-    - Supports multiple dataset roots that are concatenated into one dataset
-    - Supports configurable camera modes (image, wrist_image, or concat_view)
-    - Filters episodes for train/val split
-    - Filters frames at episode boundaries (to avoid padding issues with delta timestamps)
-    - Uses task descriptions from meta/tasks.parquet for ai_caption
-    """
-    _NORMALIZERS_DIR = Path(__file__).parent / "normalizers"
-    def __init__(
-        self,
-        repo_id: str | list[str] = "lerobot/libero_90",
-        root: str | list[str] | None = LIBERO_ROOTS,
-        image_size: int = 256,
-        chunk_length: int = 16,  # must be divisible by 4
-        fps: int = 10,  # IMPORTANT! LIBERO is at 20fps. If using frame_wise_relative in policy mode, we have to match the fps.
-        mode: str = "policy",
-        video_backend: str | None = "torchcodec",
-        download_videos: bool = False,
-        force_cache_sync: bool = False,
-        tolerance_s: float = 1e-4,
-        split: str = "train",
-        val_ratio: float = 0.01,
-        seed: int = 0,
-        # Camera configuration
-        camera_mode: str = "image",  # 'image', 'wrist_image', or 'concat_view'
-        # Action configuration
-        action_space: str = "frame_wise_relative",  # "absolute" or "relative" or "frame_wise_relative"
-        # rotation_space
-        rotation_space: Literal["9d", "6d", "3d"] = "3d",
-        # Native simulator frame or shared OpenCV-style EE frame used by midtraining.
-        pose_coordinate_frame: Literal["native", "opencv"] = "native",
-        # domain-aware configuration
-        embodiment_type: str = "libero",
-        action_normalization: Literal["quantile", "quantile_rot", "meanstd", "minmax"] | None = None,
-        action_stats_path: str | None = None,
-        skip_video_loading: bool = False,
-    ):
-        super().__init__()
-        self._embodiment_type = embodiment_type
-        self.domain_id = get_domain_id(embodiment_type)
-        self.image_size = image_size
-        self.chunk_length = chunk_length
-        assert self.chunk_length % 4 == 0, "chunk_length must be divisible by 4"
-        self.fps = fps
-        self.mode = mode
-        self.split = split.lower().strip()
-        self.val_ratio = val_ratio
-        self.seed = seed
-        self.camera_mode = camera_mode.lower().strip()
-        self.action_space = action_space
-        self.action_normalization = action_normalization
-        self.rotation_space = rotation_space.lower().strip()
-        self.pose_coordinate_frame = pose_coordinate_frame
-        self._pose_convention = self.action_space
-        self._rotation_format = libero_rotation_format(self.rotation_space)
-        # When True, skip video decoding entirely: drop image keys from
-        # delta_timestamps so LeRobot never touches the mp4, and return
-        # ``video=None`` in __getitem__. Must be set at construction time
-        # because LeRobotDataset is eagerly built in __init__.
-        self._skip_video_loading = bool(skip_video_loading)
-        # Load action normalization stats. ``action_min`` / ``action_range`` are
-        # retained for older LIBERO eval code that knows how to invert a
-        # range-style [-1, 1] normalization.
-        self._norm_stats: dict[str, torch.Tensor] | None = None
-        self.action_min: torch.Tensor | None = None
-        self.action_max: torch.Tensor | None = None
-        self.action_range: torch.Tensor | None = None
-        if self.action_normalization is not None:
-            stats_path = self._resolve_action_stats_path(action_stats_path)
-            stats_key = "global_raw" if self.action_normalization == "quantile_rot" else "global"
-            raw_stats = load_action_stats(str(stats_path), stats_key=stats_key)
-            self._norm_stats = {}
-            for key, value in raw_stats.items():
-                self._norm_stats[key] = torch.from_numpy(value).float()  # [D]
-            self._set_range_denormalization_stats()
-            log.info(
-                f"Loaded LIBERO action stats from {stats_path} with action_normalization={self.action_normalization}"
-            )
-        # Validate camera mode
-        if self.camera_mode not in {"image", "wrist_image", "concat_view"}:
-            raise ValueError(f"Unsupported camera_mode={camera_mode!r}. Use 'image', 'wrist_image', or 'concat_view'.")
-        # Validate split
-        if self.split not in {"train", "val", "valid", "validation", "eval", "test", "full"}:
-            raise ValueError(f"Unsupported {split=}. Use train/val/full.")
-        # Build delta timestamps based on camera mode
-        dt = 1.0 / self.fps
-        if self.fps != 20:
-            log.warning(
-                f"LIBERO is at 20fps. If using frame_wise_relative for policy mode training, we have to match the fps. fps={self.fps}"
-            )
-        # Determine which image keys to use
-        if self.camera_mode == "image":
-            self.image_keys = ["observation.images.image"]
-        elif self.camera_mode == "wrist_image":
-            self.image_keys = ["observation.images.wrist_image"]
-        else:  # concat_view
-            self.image_keys = ["observation.images.image", "observation.images.wrist_image"]
-        # Build delta_timestamps for all keys (same convention as PushT: 0 to chunk_length)
-        self.delta_timestamps: dict[str, list[float]] = {}
-        if not self._skip_video_loading:
-            for key in self.image_keys:
-                self.delta_timestamps[key] = [i * dt for i in range(0, chunk_length + 1)]
-        self.delta_timestamps["observation.state"] = [i * dt for i in range(0, chunk_length + 1)]
-        self.delta_timestamps["action"] = [i * dt for i in range(0, chunk_length + 1)]
-        # Normalize repo_id and root to lists
-        repo_id_list: list[str] = [repo_id] if isinstance(repo_id, str) else list(repo_id)
-        root_list: list[str | None]
-        if root is None:
-            root_list = [None for _ in repo_id_list]
-        elif isinstance(root, str):
-            root_list = [root]
-        else:
-            root_list = [r for r in root]
-        if len(repo_id_list) != len(root_list):
-            raise ValueError(
-                f"Length mismatch: repo_id has {len(repo_id_list)} items, root has {len(root_list)} items."
-            )
-        # Load all datasets
-        self.datasets: list[LeRobotDataset] = []
-        self.tasks_dfs: list = []  # Store tasks DataFrames for each dataset
-        for rid, r in zip(repo_id_list, root_list):
-            dataset = LeRobotDataset(
-                repo_id=rid,
-                root=r,
-                delta_timestamps=self.delta_timestamps,  # type: ignore
-                tolerance_s=tolerance_s,
-                force_cache_sync=force_cache_sync,
-                download_videos=download_videos,
-                video_backend=video_backend,
-                episodes=None,  # Load full dataset, filter later
-            )
-            self.datasets.append(dataset)
-            self.tasks_dfs.append(dataset.meta.tasks)
-        # Build index mapping: list of (dataset_idx, local_idx) for valid frames
-        self.index_map: list[tuple[int, int, int]] = []  # (dataset_idx, local_idx, episode_idx)
-        self._episode_boundaries: list[dict[int, tuple[int, int]]] = []
-        self._episode_splits: list[tuple[set[int], set[int]]] = []
-        total_episodes = 0
-        total_frames = 0
-        for ds_idx, dataset in enumerate(self.datasets):
-            # Compute episode splits for this dataset
-            train_eps, val_eps = self._compute_episode_splits_for_dataset(dataset)
-            self._episode_splits.append((train_eps, val_eps))
-            # Get episodes for current split
-            split_episodes = self._get_split_episodes_for_dataset(ds_idx)
-            # Build episode boundaries
-            boundaries = self._build_episode_boundaries_for_dataset(dataset)
-            self._episode_boundaries.append(boundaries)
-            # Filter indices
-            indices = self._filter_indices_for_dataset(ds_idx, dataset, split_episodes, boundaries)
-            self.index_map.extend(indices)
-            total_episodes += dataset.num_episodes
-            total_frames += len(dataset)
-        log.info(
-            f"Loaded LIBERO dataset with {len(repo_id_list)} source(s) split={self.split!r} "
-            f"camera_mode={self.camera_mode!r} "
-            f"total_episodes={total_episodes} "
-            f"total_frames={total_frames} "
-            f"valid_indices={len(self.index_map)}"
-        )
-    def _compute_episode_splits_for_dataset(self, dataset: LeRobotDataset) -> tuple[set[int], set[int]]:
-        """Compute train/val episode splits deterministically for a single dataset."""
-        total_episodes = int(dataset.meta.total_episodes)
-        if not (0.0 < self.val_ratio < 1.0):
-            raise ValueError(f"{self.val_ratio=} must be in (0, 1).")
-        n_val = max(1, int(round(total_episodes * self.val_ratio)))
-        # val_eps = set(range(n_val))
-        # train_eps = set(range(n_val, total_episodes))
-        # Yihuai: Randomly select validation episodes instead of the first n_val episodes (otherwise task will be repeated)
-        rng = random.Random(self.seed)  # To ensure validation episodes are the same on all ranks
-        val_eps = set(rng.sample(range(total_episodes), n_val))
-        train_eps = set(range(total_episodes)) - val_eps
-        log.info(f"train_eps={train_eps}, val_eps={val_eps}")
-        return train_eps, val_eps
-    def _get_split_episodes_for_dataset(self, ds_idx: int) -> set[int]:
-        """Get the episode set for the current split for a specific dataset."""
-        train_eps, val_eps = self._episode_splits[ds_idx]
-        if self.split in {"val", "valid", "validation", "eval", "test"}:
-            return val_eps
-        elif self.split == "train":
-            return train_eps
-        else:  # full
-            return train_eps | val_eps
-    def _build_episode_boundaries_for_dataset(self, dataset: LeRobotDataset) -> dict[int, tuple[int, int]]:
-        """Build a dict of episode_index -> (start_frame, end_frame) for a single dataset."""
-        boundaries: dict[int, tuple[int, int]] = {}
-        for ep in dataset.meta.episodes:
-            ep_idx = int(ep["episode_index"])  # type: ignore[index]
-            start = int(ep["dataset_from_index"])  # type: ignore[index]
-            end = int(ep["dataset_to_index"])  # type: ignore[index]
-            boundaries[ep_idx] = (start, end)
-        return boundaries
-    def _filter_indices_for_dataset(
-        self,
-        ds_idx: int,
-        dataset: LeRobotDataset,
-        split_episodes: set[int],
-        boundaries: dict[int, tuple[int, int]],
-    ) -> list[tuple[int, int, int]]:
-        """Filter valid indices for a single dataset, returning (dataset_idx, local_idx, episode_idx)."""
-        index_map: list[tuple[int, int, int]] = []
-        all_meta = list(dataset.meta.episodes)
-        for ep_idx in split_episodes:
-            if ep_idx >= len(all_meta):
-                continue
-            ep = all_meta[ep_idx]
-            ep_start = int(ep["dataset_from_index"])  # type: ignore[index]
-            ep_end = int(ep["dataset_to_index"])  # type: ignore[index]
-            # Valid range: [start, end - chunk_length - 1] inclusive
-            # We drop chunk_length frames at end to ensure we can query up to delta=chunk_length.
-            start = ep_start
-            end = ep_end - self.chunk_length - 1
-            if end >= start:
-                for local_idx in range(start, end + 1):
-                    index_map.append((ds_idx, local_idx, ep_idx))
-        return index_map
-    def __len__(self) -> int:
-        return len(self.index_map)
-    def _get_task_description(self, ds_idx: int, item: dict) -> str:
-        """Get task description for the current item from meta/tasks.parquet.
-        The tasks.parquet has task descriptions as the DataFrame index (row labels)
-        and task_index as an integer column. We look up by task_index and return
-        the corresponding index name (the actual task description string).
-        """
-        task_idx = item.get("task_index")
-        if task_idx is not None:
-            if isinstance(task_idx, torch.Tensor):
-                task_idx = task_idx.item()
-            task_idx = int(task_idx)
-            tasks_df = self.tasks_dfs[ds_idx]
-            if task_idx in tasks_df["task_index"].values:
-                row = tasks_df[tasks_df["task_index"] == task_idx].iloc[0]
-                # The task description is the index name (row label), not a column value
-                return str(row.name)
-        raise ValueError(f"Task index {task_idx} not found in tasks.parquet for dataset {ds_idx}")
-    def _compute_anchored_actions(
-        self,
-        state_raw: torch.Tensor,
-        action_raw: torch.Tensor,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Compute anchored relative actions (batched).
-        Converts frame-wise relative actions to anchored relative actions where each
-        action[t] represents the target pose (after applying action[t] to state[t])
-        expressed in state 0's local coordinate frame.
-        Mathematical formulation:
-        1. Compute target in world frame (LIBERO convention):
-           - p_{t+1} = p_t + delta_p[t]  (position addition in world frame)
-           - R_{t+1} = R_delta[t] @ R_t  (rotation composition, delta first)
-        2. Compute anchored (left-multiply by T_0^{-1}):
-           - anchored_pos[t] = R_0^T @ (p_{t+1} - p_0)
-           - anchored_rot[t] = R_0^T @ R_{t+1}
-        Args:
-            state_raw: State tensor of shape (T+1, 8): [x, y, z, ax, ay, az, grip1, grip2]
-                where (ax, ay, az) is axis-angle rotation.
-            action_raw: Action tensor of shape (T+1, 7): [dx, dy, dz, dax, day, daz, grip]
-                where (dax, day, daz) is axis-angle rotation delta.
-        Returns:
-            anchored_translation: (T, 3) - position in state_0's local frame
-            anchored_rotation_9d: (T, 9) - rotation relative to state_0 as flattened 3x3 matrix
-            gripper: (T, 1) - original gripper commands (unchanged)
-        """
-        # Extract positions and rotations from states
-        p_states = state_raw[:, :3]  # [T+1,3]
-        rotvec_states = state_raw[:, 3:6]  # [T+1,3] - axis-angle
-        # Extract deltas from actions (use first T actions)
-        delta_p = action_raw[:-1, :3]  # [T,3]
-        delta_rotvec = action_raw[:-1, 3:6]  # [T,3] - axis-angle delta
-        gripper = action_raw[:-1, 6:7]  # [T,1]
-        # Convert all axis-angle to rotation matrices (batched)
-        R_states = convert_rotation(rotvec_states, input_format="axisangle", output_format="matrix")  # [T+1,3,3]
-        R_deltas = convert_rotation(delta_rotvec, input_format="axisangle", output_format="matrix")  # [T,3,3]
-        # Initial pose (state 0)
-        p_0 = p_states[0]  # [3]
-        R_0 = R_states[0]  # [3,3]
-        R_0_T = R_0.T  # [3,3] - transpose for inverse rotation
-        # Current states for t = 0..T-1
-        p_t = p_states[:-1]  # [T,3]
-        R_t = R_states[:-1]  # [T,3,3]
-        # Step 1: Compute target poses in world frame (LIBERO convention)
-        # p_target = p_t + delta_p
-        p_target = p_t + delta_p  # [T,3]
-        # R_target = R_delta @ R_t (batched matrix multiply)
-        R_target = torch.bmm(R_deltas, R_t)  # [T,3,3]
-        # Step 2: Compute anchored (in state_0's local frame)
-        # anchored_p = R_0^T @ (p_target - p_0)
-        displacement = p_target - p_0  # [T,3]
-        anchored_p = (R_0_T @ displacement.T).T  # [T,3]
-        # anchored_R = R_0^T @ R_target (batched)
-        R_0_T_expanded = R_0_T.unsqueeze(0).expand(R_target.shape[0], -1, -1)  # [T,3,3]
-        anchored_R = torch.bmm(R_0_T_expanded, R_target)  # [T,3,3]
-        return anchored_p, anchored_R, gripper
-    def _convert_rotation_to_repr(self, rotation_matrix: torch.Tensor) -> torch.Tensor:
-        """Convert rotation matrix to the desired representation.
-        Args:
-            rotation_matrix: Rotation matrices of shape (T, 3, 3).
-        Returns:
-            Rotation in the configured ``rotation_space`` format.
-        """
-        return convert_rotation(rotation_matrix, "matrix", libero_rotation_format(self.rotation_space))
-    def _normalizer_filename(self) -> str:
-        rotation_suffix = {
-            "3d": "3d",
-            "6d": "rot6d",
-            "9d": "rot9d",
-        }.get(self.rotation_space)
-        if rotation_suffix is None:
-            raise ValueError(f"Unsupported rotation_space={self.rotation_space!r}.")
-        action_space = self.action_space.replace("-", "_")
-        return f"{self._embodiment_type}_{action_space}_{rotation_suffix}.json"
-    def _resolve_action_stats_path(self, action_stats_path: str | None) -> Path:
-        if action_stats_path is None:
-            stats_path = self._NORMALIZERS_DIR / self._normalizer_filename()
-            if stats_path.exists():
-                return stats_path
-            raise FileNotFoundError(
-                f"Could not find bundled LIBERO action stats at {stats_path}. "
-                "Pass action_stats_path explicitly or regenerate stats with compute_action_stats.py."
-            )
-        stats_path = Path(action_stats_path)
-        if stats_path.is_absolute():
-            if stats_path.exists():
-                return stats_path
-            raise FileNotFoundError(f"Could not find action_stats_path={action_stats_path!r}.")
-        module_dir = Path(__file__).resolve().parent
-        candidates: list[Path] = []
-        for parent in module_dir.parents:
-            candidates.append(parent / stats_path)
-        candidates.append(self._NORMALIZERS_DIR / stats_path.name)
-        candidates.append(module_dir / stats_path.name)
-        for candidate in candidates:
-            if candidate.exists():
-                return candidate
-        raise FileNotFoundError(
-            f"Could not resolve action_stats_path={action_stats_path!r}; tried: {[str(c) for c in candidates]}"
-        )
-    def _set_range_denormalization_stats(self) -> None:
-        if self._norm_stats is None:
-            return
-        if self.action_normalization == "minmax":
-            lo_key, hi_key = "min", "max"
-        elif self.action_normalization in ("quantile", "quantile_rot"):
-            lo_key, hi_key = "q01", "q99"
-        else:
-            return
-        if lo_key not in self._norm_stats or hi_key not in self._norm_stats:
-            raise ValueError(
-                f"Action stats for {self.action_normalization!r} normalization require "
-                f"{lo_key!r} and {hi_key!r} entries."
-            )
-        self.action_min = self._norm_stats[lo_key]  # [D]
-        self.action_max = self._norm_stats[hi_key]  # [D]
-        action_range = self.action_max - self.action_min  # [D]
-        self.action_range = torch.clamp(action_range, min=1e-6)  # [D]
-    def __getitem__(self, idx: int, _retry_count: int = 0) -> dict[str, torch.Tensor | str]:
-        """Get a single item from the dataset."""
-        max_retries = 10
-        ds_idx, local_idx, ep_idx = self.index_map[idx]
-        dataset = self.datasets[ds_idx]
-        try:
-            item = dataset[local_idx]
-        except Exception as e:
-            log.warning(
-                f"Error loading item (retry {_retry_count}/{max_retries}): idx={idx}, ds_idx={ds_idx}, "
-                f"local_idx={local_idx}, ep_idx={ep_idx}, repo_id={dataset.meta.repo_id}, error={e}"
-            )
-            if _retry_count >= max_retries:
-                raise RuntimeError(f"Failed to load data after {max_retries} retries") from e
-            new_idx = random.randint(0, len(self) - 1)
-            return self.__getitem__(new_idx, _retry_count + 1)
-        if self.mode == "joint":
-            mode = random.choice(["forward_dynamics", "inverse_dynamics", "policy", "image2video"])
-        else:
-            mode = self.mode
-        # Get task description for ai_caption
-        task_description = self._get_task_description(ds_idx, item)
-        # Process video based on camera mode (skipped entirely when
-        # skip_video_loading=True; image keys are also absent from
-        # delta_timestamps so LeRobot never decoded them).
-        video: torch.Tensor | None
-        if self._skip_video_loading:
-            video = None
-        else:
-            if self.camera_mode == "concat_view":
-                # Load both cameras and concatenate horizontally
-                video_1: torch.Tensor = item["observation.images.image"]
-                video_2: torch.Tensor = item["observation.images.wrist_image"]
-                # Resize each if needed
-                if video_1.shape[-1] != self.image_size or video_1.shape[-2] != self.image_size:
-                    video_1 = F.resize(video_1, [self.image_size, self.image_size])
-                if video_2.shape[-1] != self.image_size or video_2.shape[-2] != self.image_size:
-                    video_2 = F.resize(video_2, [self.image_size, self.image_size])
-                # Concatenate along width dimension (last dim for TCHW)
-                video_tchw = torch.cat([video_1, video_2], dim=-1)  # (T, C, H, W*2)
-            else:
-                # Single camera mode
-                image_key = self.image_keys[0]
-                video_tchw = item[image_key]
-                # Resize if needed
-                if video_tchw.shape[-1] != self.image_size or video_tchw.shape[-2] != self.image_size:
-                    video_tchw = F.resize(video_tchw, [self.image_size, self.image_size])
-            # Convert to uint8 and transpose to (C, T, H, W)
-            video = (video_tchw * 255).clamp(0, 255).to(torch.uint8).permute(1, 0, 2, 3)
-        # Action (raw): LIBERO actions are 7D (6 DoF + gripper)
-        action_raw: torch.Tensor = item["action"]
-        # State (raw): LIBERO state is 8D (6 DoF + 2 gripper states)
-        state_raw: torch.Tensor = item["observation.state"]
-        # Action: (T+1, D) -> (T, D)
-        # Take all but last action
-        # LIBERO action format: [x, y, z, ax, ay, az, gripper] (7D) where (ax,ay,az) is axis-angle
-        if self.action_space == "relative":
-            # Compute anchored relative actions
-            # Returns: translation (T, 3), rotation_matrix (T, 3, 3), gripper (T, 1)
-            translation, rotation_matrix, gripper = self._compute_anchored_actions(state_raw, action_raw.clone())
-        elif self.action_space == "frame_wise_relative":
-            action = action_raw[:-1].clone()  # [T,7]
-            translation = action[:, :3]  # [T,3]
-            rotation_rotvec = action[:, 3:6]  # [T,3]
-            gripper = action[:, 6:]  # [T,1]
-            rotation_matrix = convert_rotation(
-                rotation_rotvec, input_format="axisangle", output_format="matrix"
-            )  # [T,3,3]
-        else:
-            raise ValueError(f"Unsupported action space: {self.action_space}")
-        rotation = self._convert_rotation_to_repr(rotation_matrix)  # [T,rot_dim]
-        action = torch.cat([translation, rotation, gripper], dim=-1)  # [T,action_dim]
-        # Compute idle_frames from the raw (un-normalized) action, only when the
-        # action layout has correct per-frame idle semantics (frame_wise_relative
-        # ⇔ backward_framewise). The other action_spaces ("relative",
-        # "absolute") encode per-frame motion differently and would not give
-        # meaningful idle counts under the same threshold check.
-        idle_frames: torch.Tensor | None = None
-        if self.action_space == "frame_wise_relative":
-            try:
-                spec = build_action_spec(Pos(), Rot(libero_rotation_format(self.rotation_space)), Gripper())
-                n = compute_idle_frames(action, spec)
-                idle_frames = torch.tensor(n, dtype=torch.long)
-            except (ValueError, TypeError):
-                idle_frames = None
-        if self.action_normalization is not None and self._norm_stats is not None and self.action_min is not None:
-            if action.shape[-1] != self.action_min.shape[0]:
-                raise ValueError(
-                    f"Action dimension {action.shape[-1]} does not match stats dimension "
-                    f"{self.action_min.shape[0]}. Recompute stats for the current "
-                    f"rotation_space={self.rotation_space!r} and action_space={self.action_space!r}."
-                )
-            method = "quantile" if self.action_normalization == "quantile_rot" else self.action_normalization
-            action = normalize_action(action, method, self._norm_stats)  # [T,D]
-        # Index
-        key = torch.tensor([local_idx], dtype=torch.long)
-        if self.camera_mode == "image":
-            viewpoint = "third_person_view"
-        elif self.camera_mode == "wrist_image":
-            viewpoint = "wrist_view"
-        else:
-            viewpoint = "concat_view"
-        result: dict[str, torch.Tensor | str] = {
-            "source_repo_id": dataset.meta.repo_id,
-            "video": video,
-            "action": action,
-            "action_raw": action_raw,
-            "conditioning_fps": torch.tensor(self.fps, dtype=torch.long),
-            "prompt": task_description,
-            "ai_caption": task_description,
-            "mode": mode,
-            "state": state_raw,
-            "action_space": self.action_space,
-            "rotation_space": self.rotation_space,
-            "pose_coordinate_frame": self.pose_coordinate_frame,
-            "__key__": key,
-            "domain_id": torch.tensor(self.domain_id, dtype=torch.long),
-            "viewpoint": viewpoint,
-        }
-        if idle_frames is not None:
-            result["idle_frames"] = idle_frames
-        if self.camera_mode == "concat_view" and not self._skip_video_loading:
-            result["additional_view_description"] = (
-                "The left half shows the third-person view; the right half shows the wrist-mounted camera."
-            )
-        return result
-    @property
-    def action_dim(self) -> int:
-        return libero_action_dim(self.rotation_space)

cosmos-framework/cosmos_framework/data/vfm/action/libero_pose_utils.py DELETED Viewed

@@ -1,69 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-"""Small LIBERO pose helpers shared by training and closed-loop eval."""
-from __future__ import annotations
-import numpy as np
-import torch
-from cosmos_framework.data.vfm.action.pose_utils import (
-    RotationConvention,
-    build_abs_pose_from_components,
-)
-# Same local-frame post-rotation pattern used by DROID/Bridge/Fractal:
-# R_opencv = R_native @ *_TO_OPENCV.
-LIBERO_TO_OPENCV: np.ndarray = np.array(
-    [[0.0, -1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 0.0, 1.0]],
-    dtype=np.float32,
-)
-LIBERO_ROTATION_FORMATS: dict[str, RotationConvention] = {
-    "3d": "axisangle",
-    "6d": "rot6d",
-    "9d": "rot9d",
-}
-LIBERO_ACTION_DIMS: dict[str, int] = {"3d": 7, "6d": 10, "9d": 13}
-def libero_rotation_format(rotation_space: str) -> RotationConvention:
-    """Return the shared ``pose_utils`` rotation format for a LIBERO setting."""
-    rotation_format = LIBERO_ROTATION_FORMATS.get(rotation_space)
-    if rotation_format is None:
-        raise ValueError(f"Unsupported rotation_space={rotation_space!r}. Use 3d/6d/9d.")
-    return rotation_format
-def libero_action_dim(rotation_space: str) -> int:
-    """Return ``[xyz, rotation, gripper]`` action width for LIBERO."""
-    action_dim = LIBERO_ACTION_DIMS.get(rotation_space)
-    if action_dim is None:
-        raise ValueError(f"Unsupported rotation_space={rotation_space!r}. Use 3d/6d/9d.")
-    return action_dim
-def libero_rotation_space_from_action_dim(action_dim: int) -> str:
-    """Infer LIBERO rotation space from unpadded action width."""
-    for rotation_space, dim in LIBERO_ACTION_DIMS.items():
-        if dim == action_dim:
-            return rotation_space
-    raise ValueError(f"Unable to infer rotation_space from action_dim={action_dim}.")
-def build_libero_abs_pose(state_raw: torch.Tensor | np.ndarray, *, to_opencv: bool) -> np.ndarray:
-    """Build absolute LIBERO EE poses from state rows.
-    ``state_raw`` is ``[x,y,z,axisangle(3),gripper(2)]``.  When requested, the
-    local EE frame is post-rotated into the shared OpenCV-style action frame.
-    """
-    if isinstance(state_raw, torch.Tensor):
-        state_np = state_raw.detach().cpu().numpy().astype(np.float32, copy=False)
-    else:
-        state_np = np.asarray(state_raw, dtype=np.float32)
-    poses_abs = build_abs_pose_from_components(state_np[:, :3], state_np[:, 3:6], "axisangle")
-    if to_opencv:
-        poses_abs[:, :3, :3] = poses_abs[:, :3, :3] @ LIBERO_TO_OPENCV
-    return poses_abs

cosmos-framework/cosmos_framework/data/vfm/action/normalizers/bridge_orig_lerobot_backward_framewise_rot6d.json DELETED Viewed

@@ -1,33 +0,0 @@
-{
-  "metadata": {
-    "embodiment_type":    "bridge_orig_lerobot",
-    "pose_convention":    "backward_framewise",
-    "rotation_format":    "rot6d",
-    "action_dim":         10,
-    "skip_rotation_dims": [3, 4, 5, 6, 7, 8],
-    "chunk_length":       16,
-    "sample_stride":      16,
-    "dataset_name":       "bridge_20260416",
-    "dataset_class":      "BridgeOrigLeRobotDataset",
-    "dataset_root":       "",
-    "split":              "train",
-    "num_samples_stats":  83036,
-    "reservoir_size":     5000000
-  },
-  "global": {
-    "mean": [-0.000094, -0.000394,  0.001623,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.582683],
-    "std":  [ 0.013297,  0.009985,  0.012079,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  0.489959],
-    "min":  [-0.309451, -0.074740, -0.082767, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000,  0.000000],
-    "max":  [ 0.127018,  0.414660,  0.493186,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000],
-    "q01":  [-0.038884, -0.028667, -0.037840, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000,  0.000000],
-    "q99":  [ 0.039722,  0.029068,  0.026702,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000]
-  },
-  "global_raw": {
-    "mean": [-0.000094, -0.000394,  0.001623,  0.998307, -0.001371,  0.000061,  0.001414,  0.998226, -0.000154,  0.582683],
-    "std":  [ 0.013297,  0.009985,  0.012079,  0.004630,  0.050168,  0.029018,  0.050165,  0.004328,  0.031742,  0.489959],
-    "min":  [-0.309451, -0.074740, -0.082767, -0.845782, -0.636628, -0.401535, -0.590214, -0.217448, -0.979635,  0.000000],
-    "max":  [ 0.127018,  0.414660,  0.493186,  1.000000,  0.362611,  0.601211,  0.619479,  1.000000,  0.365993,  1.000000],
-    "q01":  [-0.038884, -0.028667, -0.037840,  0.976292, -0.163098, -0.081545, -0.160193,  0.976322, -0.078872,  0.000000],
-    "q99":  [ 0.039722,  0.029068,  0.026702,  1.000000,  0.160195,  0.081655,  0.163227,  1.000000,  0.095189,  1.000000]
-  }
-}

cosmos-framework/cosmos_framework/data/vfm/action/normalizers/droid_lerobot_backward_framewise_rot6d.json DELETED Viewed

@@ -1,33 +0,0 @@
-{
-  "metadata": {
-    "embodiment_type":    "droid_lerobot",
-    "pose_convention":    "backward_framewise",
-    "rotation_format":    "rot6d",
-    "action_dim":         10,
-    "skip_rotation_dims": [3, 4, 5, 6, 7, 8],
-    "chunk_length":       16,
-    "sample_stride":      16,
-    "dataset_name":       "droid_20260418",
-    "dataset_class":      "DROIDLeRobotDataset",
-    "dataset_root":       "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/droid_plus_lerobot_640x360_20260412",
-    "split":              "train",
-    "num_samples_stats":  1321153,
-    "reservoir_size":     5000000
-  },
-  "global": {
-    "mean": [-0.000017, -0.000612,  0.000568,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.588911],
-    "std":  [ 0.004539,  0.004054,  0.004999,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  0.441186],
-    "min":  [-0.075397, -0.057288, -0.056677, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000,  0.000000],
-    "max":  [ 0.073107,  0.082187,  0.077080,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000],
-    "q01":  [-0.014200, -0.013416, -0.015206, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000,  0.000000],
-    "q99":  [ 0.014515,  0.011517,  0.014520,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000]
-  },
-  "global_raw": {
-    "mean": [-0.000017, -0.000612,  0.000568,  0.999830,  0.000227, -0.000152, -0.000222,  0.999818,  0.000417,  0.588911],
-    "std":  [ 0.004539,  0.004054,  0.004999,  0.000336,  0.014924,  0.010784,  0.014927,  0.000351,  0.011903,  0.441186],
-    "min":  [-0.075397, -0.057288, -0.056677,  0.695640, -0.220599, -0.195892, -0.697421,  0.600468, -0.154176,  0.000000],
-    "max":  [ 0.073107,  0.082187,  0.077080,  1.000000,  0.698449,  0.168089,  0.220605,  1.000000,  0.391206,  1.000000],
-    "q01":  [-0.014200, -0.013416, -0.015206,  0.998459, -0.047659, -0.034774, -0.047609,  0.998428, -0.035553,  0.000000],
-    "q99":  [ 0.014515,  0.011517,  0.014520,  1.000000,  0.047596,  0.034660,  0.047654,  1.000000,  0.038888,  1.000000]
-  }
-}

cosmos-framework/cosmos_framework/data/vfm/action/normalizers/fractal_backward_framewise_rot6d.json DELETED Viewed

@@ -1,33 +0,0 @@
-{
-  "metadata": {
-    "embodiment_type":    "fractal",
-    "pose_convention":    "backward_framewise",
-    "rotation_format":    "rot6d",
-    "action_dim":         10,
-    "skip_rotation_dims": [3, 4, 5, 6, 7, 8],
-    "chunk_length":       16,
-    "sample_stride":      16,
-    "dataset_name":       "fractal_20260413",
-    "dataset_class":      "FractalLeRobotDataset",
-    "dataset_root":       "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/fractal20220817_data_no_noops",
-    "split":              "train",
-    "num_samples_stats":  166961,
-    "reservoir_size":     5000000
-  },
-  "global": {
-    "mean": [ 0.002259,  0.000721,  0.009372,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.526947],
-    "std":  [ 0.014178,  0.016428,  0.022554,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  0.499273],
-    "min":  [-0.151886, -0.176424, -0.194576, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000,  0.000000],
-    "max":  [ 0.130892,  0.190835,  0.193839,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000],
-    "q01":  [-0.039816, -0.049270, -0.056266, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000,  0.000000],
-    "q99":  [ 0.043860,  0.050352,  0.072505,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000]
-  },
-  "global_raw": {
-    "mean": [ 0.002259,  0.000721,  0.009372,  0.998347,  0.001789,  0.002694, -0.001861,  0.997811,  0.016366,  0.526947],
-    "std":  [ 0.014178,  0.016428,  0.022554,  0.003377,  0.043416,  0.037369,  0.043211,  0.004566,  0.047057,  0.499273],
-    "min":  [-0.151886, -0.176424, -0.194576,  0.520558, -0.676280, -0.822475, -0.460521,  0.736643, -0.517041,  0.000000],
-    "max":  [ 0.130892,  0.190835,  0.193839,  1.000000,  0.461026,  0.403940,  0.671708,  1.000000,  0.505528,  1.000000],
-    "q01":  [-0.039816, -0.049270, -0.056266,  0.983667, -0.134543, -0.107048, -0.126518,  0.977277, -0.091363,  0.000000],
-    "q99":  [ 0.043860,  0.050352,  0.072505,  1.000000,  0.127404,  0.107273,  0.134140,  1.000000,  0.179731,  1.000000]
-  }
-}

cosmos-framework/cosmos_framework/data/vfm/action/normalizers/libero_native_frame_wise_relative_rot6d.json DELETED Viewed

@@ -1,37 +0,0 @@
-{
-  "metadata": {
-    "embodiment_type":    "libero",
-    "pose_convention":    "frame_wise_relative",
-    "pose_coordinate_frame": "native",
-    "rotation_format":    "6d",
-    "action_dim":         10,
-    "skip_rotation_dims": [3, 4, 5, 6, 7, 8],
-    "chunk_length":       16,
-    "sample_stride":      null,
-    "dataset_name":       "libero",
-    "dataset_class":      "LIBERODataset",
-    "dataset_root":       ["outputs/libero_datasets/libero_10", "outputs/libero_datasets/libero_object", "outputs/libero_datasets/libero_spatial", "outputs/libero_datasets/libero_goal"],
-    "_comment": "Dataset paths are placeholders; the statistics values are independent of local dataset location.",
-    "split":              "train",
-    "num_samples_stats":  10000,
-    "reservoir_size":     50000,
-    "max_samples":        10000,
-    "sampling_seed":      42
-  },
-  "global": {
-    "mean": [ 0.050704,  0.097407, -0.094833,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.476725],
-    "std":  [ 0.333621,  0.387175,  0.457140,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  0.499460],
-    "min":  [-0.937500, -0.937500, -0.937500, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000,  0.000000],
-    "max":  [ 0.937500,  0.937500,  0.937500,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000],
-    "q01":  [-0.723214, -0.808929, -0.937500, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000,  0.000000],
-    "q99":  [ 0.937500,  0.870536,  0.937500,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000]
-  },
-  "global_raw": {
-    "mean": [ 0.050704,  0.097407, -0.094833,  0.994873, -0.004579, -0.004288,  0.004389,  0.996104,  0.001109,  0.476725],
-    "std":  [ 0.333621,  0.387175,  0.457140,  0.010807,  0.077802,  0.063386,  0.078571,  0.009994,  0.038504,  0.499460],
-    "min":  [-0.937500, -0.937500, -0.937500,  0.902028, -0.356085, -0.367416, -0.370434,  0.921907, -0.255000,  0.000000],
-    "max":  [ 0.937500,  0.937500,  0.937500,  1.000000,  0.368853,  0.341214,  0.356395,  1.000000,  0.348251,  1.000000],
-    "q01":  [-0.723214, -0.808929, -0.937500,  0.934955, -0.223431, -0.189878, -0.334735,  0.938516, -0.107736,  0.000000],
-    "q99":  [ 0.937500,  0.870536,  0.937500,  1.000000,  0.331000,  0.163153,  0.226216,  1.000000,  0.127158,  1.000000]
-  }
-}

cosmos-framework/cosmos_framework/data/vfm/action/normalizers/robomind-franka-dual_backward_framewise_rot6d.json DELETED Viewed

@@ -1,33 +0,0 @@
-{
-  "metadata": {
-    "embodiment_type":    "robomind-franka-dual",
-    "pose_convention":    "backward_framewise",
-    "rotation_format":    "rot6d",
-    "action_dim":         20,
-    "skip_rotation_dims": [3, 4, 5, 6, 7, 8, 13, 14, 15, 16, 17, 18],
-    "chunk_length":       16,
-    "sample_stride":      16,
-    "dataset_name":       "robomind_franka_dual_20260414",
-    "dataset_class":      "RoboMINDFrankaDataset",
-    "dataset_root":       "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/RoboMIND_20251228",
-    "split":              "train",
-    "num_samples_stats":  21410,
-    "reservoir_size":     5000000
-  },
-  "global": {
-    "mean": [ 0.000231,  0.000179, -0.000319,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.638652,  0.000148, -0.000377, -0.000241,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.815273],
-    "std":  [ 0.014881,  0.008081,  0.014371,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  0.464058,  0.010628,  0.005868,  0.007900,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  0.366049],
-    "min":  [-0.115093, -0.096415, -0.112595, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000,  0.000000, -0.091252, -0.052148, -0.113650, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000,  0.000000],
-    "max":  [ 0.114941,  0.063433,  0.098721,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  0.123908,  0.077951,  0.080229,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  0.999628],
-    "q01":  [-0.051367, -0.031964, -0.046482, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000,  0.000000, -0.035108, -0.021212, -0.029788, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000,  0.000000],
-    "q99":  [ 0.043729,  0.021737,  0.036738,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  0.047581,  0.021270,  0.025712,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  0.995443]
-  },
-  "global_raw": {
-    "mean": [ 0.000231,  0.000179, -0.000319,  0.999350,  0.000318, -0.000135, -0.000286,  0.999586, -0.000051,  0.638652,  0.000148, -0.000377, -0.000241,  0.999294,  0.000343,  0.000431, -0.000147,  0.999580, -0.000570,  0.815273],
-    "std":  [ 0.014881,  0.008081,  0.014371,  0.002235,  0.020196,  0.029781,  0.020185,  0.001125,  0.020484,  0.464058,  0.010628,  0.005868,  0.007900,  0.002664,  0.025404,  0.027550,  0.025193,  0.001657,  0.014210,  0.366049],
-    "min":  [-0.115093, -0.096415, -0.112595,  0.944314, -0.271877, -0.325264, -0.254808,  0.962274, -0.227188,  0.000000, -0.091252, -0.052148, -0.113650,  0.941406, -0.265241, -0.273484, -0.290840,  0.954990, -0.264631,  0.000000],
-    "max":  [ 0.114941,  0.063433,  0.098721,  1.000000,  0.258475,  0.270230,  0.271943,  1.000000,  0.221936,  1.000000,  0.123908,  0.077951,  0.080229,  1.000000,  0.296517,  0.333596,  0.269131,  1.000000,  0.139695,  0.999628],
-    "q01":  [-0.051367, -0.031964, -0.046482,  0.988101, -0.053179, -0.128603, -0.075432,  0.994427, -0.059973,  0.000000, -0.035108, -0.021212, -0.029788,  0.986086, -0.098043, -0.111441, -0.093441,  0.991492, -0.058030,  0.000000],
-    "q99":  [ 0.043729,  0.021737,  0.036738,  1.000000,  0.075612,  0.102791,  0.053223,  1.000000,  0.077057,  1.000000,  0.047581,  0.021270,  0.025712,  1.000000,  0.095525,  0.126049,  0.098778,  1.000000,  0.041914,  0.995443]
-  }
-}

cosmos-framework/cosmos_framework/data/vfm/action/normalizers/robomind-franka_backward_framewise_rot6d.json DELETED Viewed

@@ -1,33 +0,0 @@
-{
-  "metadata": {
-    "embodiment_type":    "robomind-franka",
-    "pose_convention":    "backward_framewise",
-    "rotation_format":    "rot6d",
-    "action_dim":         10,
-    "skip_rotation_dims": [3, 4, 5, 6, 7, 8],
-    "chunk_length":       16,
-    "sample_stride":      16,
-    "dataset_name":       "robomind_franka_20260414",
-    "dataset_class":      "RoboMINDFrankaDataset",
-    "dataset_root":       "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/RoboMIND_20251228",
-    "split":              "train",
-    "num_samples_stats":  141658,
-    "reservoir_size":     5000000
-  },
-  "global": {
-    "mean": [ 0.000241,  0.000073, -0.000597,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.630501],
-    "std":  [ 0.020545,  0.010725,  0.022054,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  0.434021],
-    "min":  [-0.184377, -0.130924, -0.183947, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000,  0.000000],
-    "max":  [ 0.227682,  0.134118,  0.133222,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000],
-    "q01":  [-0.065029, -0.030683, -0.075321, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000,  0.000000],
-    "q99":  [ 0.068546,  0.036309,  0.051772,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000,  1.000000]
-  },
-  "global_raw": {
-    "mean": [ 0.000241,  0.000073, -0.000597,  0.998782,  0.000605,  0.000003, -0.000592,  0.998245, -0.000508,  0.630501],
-    "std":  [ 0.020545,  0.010725,  0.022054,  0.004056,  0.043101,  0.023671,  0.043102,  0.004948,  0.040306,  0.434021],
-    "min":  [-0.184377, -0.130924, -0.183947,  0.837403, -0.525301, -0.384252, -0.543663,  0.801190, -0.490979,  0.000000],
-    "max":  [ 0.227682,  0.134118,  0.133222,  1.000000,  0.543800,  0.389145,  0.522029,  1.000000,  0.414190,  1.000000],
-    "q01":  [-0.065029, -0.030683, -0.075321,  0.981664, -0.137429, -0.069593, -0.140220,  0.976885, -0.140399,  0.000000],
-    "q99":  [ 0.068546,  0.036309,  0.051772,  1.000000,  0.140290,  0.079942,  0.137529,  1.000000,  0.113651,  1.000000]
-  }
-}

cosmos-framework/cosmos_framework/data/vfm/action/robomind_franka_dataset.py CHANGED Viewed

@@ -25,7 +25,6 @@ import torch
 import torch.nn.functional as F
 from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
-    ActionNormalization,
     ActionSpec,
     BaseActionLeRobotDataset,
     Gripper,
@@ -98,7 +97,6 @@ class RoboMINDFrankaDataset(BaseActionLeRobotDataset):
         mode: str = "policy",
         embodiment_type: str = "robomind-franka",
         pose_convention: str = "backward_framewise",
-        action_normalization: ActionNormalization | None = None,
         viewpoint: Viewpoint = "concat_view",
         enable_fast_init: bool = False,
     ) -> None:
@@ -119,7 +117,6 @@ class RoboMINDFrankaDataset(BaseActionLeRobotDataset):
             viewpoint=viewpoint,
             pose_convention=pose_convention,
             rotation_format="rot6d",
-            action_normalization=action_normalization,
             tolerance_s=1e-4,
             enable_fast_init=enable_fast_init,
         )

 import torch.nn.functional as F
 from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
     ActionSpec,
     BaseActionLeRobotDataset,
     Gripper,
         mode: str = "policy",
         embodiment_type: str = "robomind-franka",
         pose_convention: str = "backward_framewise",
         viewpoint: Viewpoint = "concat_view",
         enable_fast_init: bool = False,
     ) -> None:
             viewpoint=viewpoint,
             pose_convention=pose_convention,
             rotation_format="rot6d",
             tolerance_s=1e-4,
             enable_fast_init=enable_fast_init,
         )

cosmos-framework/cosmos_framework/data/vfm/action/umi_lerobot_dataset.py CHANGED Viewed

@@ -12,7 +12,7 @@ import torch
 from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata
 from cosmos_framework.data.vfm.action.action_spec import ActionSpec, Gripper, Pos, Rot, build_action_spec
-from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import ActionNormalization, BaseActionLeRobotDataset
 from cosmos_framework.data.vfm.action.pose_utils import PoseConvention, build_abs_pose_from_components, pose_abs_to_rel
 from cosmos_framework.data.vfm.action.viewpoint_utils import Viewpoint
@@ -35,7 +35,6 @@ class UMIFastLeRobotDataset(BaseActionLeRobotDataset):
         split: Literal["train", "val", "full"] = "train",
         mode: str = "policy",
         pose_convention: PoseConvention = "backward_framewise",
-        action_normalization: ActionNormalization | None = None,
         viewpoint: Viewpoint = "ego_view",
         enable_fast_init: bool = False,
     ) -> None:
@@ -50,7 +49,6 @@ class UMIFastLeRobotDataset(BaseActionLeRobotDataset):
             viewpoint=viewpoint,
             pose_convention=pose_convention,
             rotation_format="rot6d",
-            action_normalization=action_normalization,
             tolerance_s=1e-4,
             enable_fast_init=enable_fast_init,
         )

 from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata
 from cosmos_framework.data.vfm.action.action_spec import ActionSpec, Gripper, Pos, Rot, build_action_spec
+from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import BaseActionLeRobotDataset
 from cosmos_framework.data.vfm.action.pose_utils import PoseConvention, build_abs_pose_from_components, pose_abs_to_rel
 from cosmos_framework.data.vfm.action.viewpoint_utils import Viewpoint
         split: Literal["train", "val", "full"] = "train",
         mode: str = "policy",
         pose_convention: PoseConvention = "backward_framewise",
         viewpoint: Viewpoint = "ego_view",
         enable_fast_init: bool = False,
     ) -> None:
             viewpoint=viewpoint,
             pose_convention=pose_convention,
             rotation_format="rot6d",
             tolerance_s=1e-4,
             enable_fast_init=enable_fast_init,
         )

cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/README.md CHANGED Viewed

@@ -4,10 +4,15 @@ Interactive 3D viewer for release-supported robot action datasets. It renders
 RGB frames, end-effector trajectories, and robot meshes/IK with
 [`viser`](https://viser.studio/), MuJoCo, and Pinocchio.
-The OSS viewer registry is intentionally scoped to the public action datasets:
-- Bridge LeRobot v3: [`nvidia/bridge_lerobot_v3`](https://huggingface.co/datasets/nvidia/bridge_lerobot_v3)
-- LIBERO LeRobot v3: [`nvidia/LIBERO_LeRobot_v3`](https://huggingface.co/datasets/nvidia/LIBERO_LeRobot_v3)
 The dataset LazyConfig entries live in:
@@ -17,29 +22,10 @@ cosmos_framework/data/vfm/action/urdf_visualizer/action_datasets.py
 ## Dataset roots
-Set dataset roots before launching. Dedicated variables are preferred when both
-datasets are available; `DATASET_PATH` remains a convenient fallback for a
-single downloaded dataset.
-```bash
-# Bridge root: downloaded from nvidia/bridge_lerobot_v3 or an existing mirror.
-export BRIDGE_LEROBOT_ROOT=/path/to/bridge_lerobot_v3
-# LIBERO root: parent directory containing libero_10/, libero_object/,
-# libero_spatial/, and libero_goal/.
-export LIBERO_ROOT=/path/to/LIBERO_LeRobot_v3
-```
-To download the public datasets instead:
-```bash
-uvx hf@latest download --repo-type dataset nvidia/bridge_lerobot_v3 \
-  --local-dir examples/data/bridge_lerobot_v3 --quiet
-uvx hf@latest download --repo-type dataset nvidia/LIBERO_LeRobot_v3 \
-  --revision ddc1edeb6e51e2b7d4d2ba7a1433daaecd37aa64 \
-  --local-dir examples/data/LIBERO_LeRobot_v3 --quiet
-```
 ## Local launch
@@ -53,8 +39,6 @@ Then launch from the repository root:
 ```bash
 export PYTHONPATH=.
-export BRIDGE_LEROBOT_ROOT=/path/to/bridge_lerobot_v3
-export LIBERO_ROOT=/path/to/LIBERO_LeRobot_v3
 python cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py \
   --port 8020 \

 RGB frames, end-effector trajectories, and robot meshes/IK with
 [`viser`](https://viser.studio/), MuJoCo, and Pinocchio.
+The standalone viewer registry is intentionally scoped to the packaged demo datasets:
+- AV
+- Fractal
+- Bridge
+- DROID
+- UMI / FastUMI
+- RoboMIND Franka
+- RoboMIND Franka dual
 The dataset LazyConfig entries live in:
 ## Dataset roots
+The Docker Space uses packaged examples under `/app/assets/examples`. For local
+development, override individual roots only when needed, e.g. `BRIDGE_LEROBOT_ROOT`,
+`FRACTAL_ROOT`, `DROID_ROOT`, `UMI_ROOT`, `ROBOMIND_FRANKA_ROOT`,
+`ROBOMIND_FRANKA_DUAL_ROOT`, or `AV_ROOT`.
 ## Local launch
 ```bash
 export PYTHONPATH=.
 python cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py \
   --port 8020 \

cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/action_datasets.py CHANGED Viewed

@@ -49,7 +49,6 @@ BRIDGE_ROOT = _env_path("BRIDGE_LEROBOT_ROOT", "DATASET_PATH", default="/app/ass
 DATASET_BRIDGE_480 = L(dataset_entry)(
     name="bridge_20260501",
     dataset=L(BridgeOrigLeRobotDataset)(
-        action_normalization="quantile_rot",
         chunk_length=16,
         enable_fast_init=True,
         fps=5.0,
@@ -72,7 +71,6 @@ DATASET_FRACTAL_256 = L(dataset_entry)(
         root=_env_path("FRACTAL_ROOT", default="/app/assets/examples/fractal20220817_data"),
         split="train",
         mode="joint",
-        action_normalization="quantile_rot",
         enable_fast_init=False,
     ),
     ratio=1,
@@ -86,7 +84,6 @@ DATASET_DROID_480 = L(dataset_entry)(
         split="train",
         use_success_only=True,
         mode="joint",
-        action_normalization="quantile_rot",
         enable_fast_init=False,
     ),
     ratio=1,
@@ -100,7 +97,6 @@ DATASET_ROBOMIND_FRANKA_480 = L(dataset_entry)(
         split="train",
         mode="joint",
         embodiment_type="robomind-franka",
-        action_normalization="quantile_rot",
         enable_fast_init=False,
     ),
     ratio=1,
@@ -114,7 +110,6 @@ DATASET_ROBOMIND_FRANKA_DUAL_480 = L(dataset_entry)(
         split="train",
         mode="joint",
         embodiment_type="robomind-franka-dual",
-        action_normalization="quantile_rot",
         enable_fast_init=False,
     ),
     ratio=1,
@@ -151,7 +146,6 @@ DATASET_UMI_256 = L(dataset_entry)(
         root=_env_path("UMI_ROOT", default="/app/assets/examples/fastumi/fastumi_single_arm/pour_coke"),
         split="train",
         mode="joint",
-        action_normalization=None,
         enable_fast_init=False,
     ),
     ratio=1,

 DATASET_BRIDGE_480 = L(dataset_entry)(
     name="bridge_20260501",
     dataset=L(BridgeOrigLeRobotDataset)(
         chunk_length=16,
         enable_fast_init=True,
         fps=5.0,
         root=_env_path("FRACTAL_ROOT", default="/app/assets/examples/fractal20220817_data"),
         split="train",
         mode="joint",
         enable_fast_init=False,
     ),
     ratio=1,
         split="train",
         use_success_only=True,
         mode="joint",
         enable_fast_init=False,
     ),
     ratio=1,
         split="train",
         mode="joint",
         embodiment_type="robomind-franka",
         enable_fast_init=False,
     ),
     ratio=1,
         split="train",
         mode="joint",
         embodiment_type="robomind-franka-dual",
         enable_fast_init=False,
     ),
     ratio=1,
         root=_env_path("UMI_ROOT", default="/app/assets/examples/fastumi/fastumi_single_arm/pour_coke"),
         split="train",
         mode="joint",
         enable_fast_init=False,
     ),
     ratio=1,

cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/ik_solver.py CHANGED Viewed

@@ -575,8 +575,8 @@ def compute_mujoco_geom_transforms(
             for i, mi in enumerate(pin_to_mj_map):
                 data.qpos[mi] = q[i]
         else:
-            # For robots with a separate gripper ctrl signal (e.g. UR5e),
-            # only write arm joints — the 7th column is a raw gripper value,
             # not a qpos DOF. For other robots the pinocchio output already
             # includes finger joints in the trailing columns; write them all.
             n_set = n_arm if has_gripper_ctrl else len(q)

             for i, mi in enumerate(pin_to_mj_map):
                 data.qpos[mi] = q[i]
         else:
+            # For robots with a separate gripper ctrl signal, only write arm
+            # joints — the extra column is a raw gripper value,
             # not a qpos DOF. For other robots the pinocchio output already
             # includes finger joints in the trailing columns; write them all.
             n_set = n_arm if has_gripper_ctrl else len(q)

cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/unified_action.py CHANGED Viewed

@@ -1,33 +1,23 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-"""Canonical 57D action representation with explicit input formats.
-57D layout::
-    [ego(9) | R_wrist(9) | R_fingers(15) | L_wrist(9) | L_fingers(15)]
-Each 9D SE(3) slot is ``[pos(3) + rot6d(6)]``.
-Each finger slot is 3D (position in wrist-local frame), 5 fingers × 3D = 15D.
-Any supported action format is converted to ``UnifiedAction(action_57d, mask)``
-before the viewer processes it. The mask explicitly declares which slots are valid.
 """
 from __future__ import annotations
-from dataclasses import dataclass, field
 from enum import Enum
 from typing import Any
 import numpy as np
 import torch
-from cosmos_framework.utils import log
-from cosmos_framework.data.vfm.action.pose_utils import convert_rotation
-FINGER_NAMES = ("thumb", "index", "middle", "ring", "pinky")
-ALL_FINGERS = (True, True, True, True, True)
 NO_FINGERS = (False, False, False, False, False)
@@ -37,30 +27,15 @@ class ActionFormat(str, Enum):
     EGO_9D = "9d"
     SINGLE_ARM_10D = "10d"
     DUAL_ARM_20D = "20d"
-    UNIFIED_57D = "57d"
     @property
     def expected_dim(self) -> int:
-        """Return the exact trailing dimension required by this format."""
-        return {
-            ActionFormat.EGO_9D: 9,
-            ActionFormat.SINGLE_ARM_10D: 10,
-            ActionFormat.DUAL_ARM_20D: 20,
-            ActionFormat.UNIFIED_57D: 57,
-        }[self]
-# ─── Data Structures ─────────────────────────────────────────────────────────
 @dataclass
 class Action57DMask:
-    """Per-component validity for the 57D layout.
-    ``right_fingers`` / ``left_fingers`` are tuples of 5 bools
-    (thumb, index, middle, ring, pinky) — supports any combination from
-    2-finger grippers to full 5-finger hands.
-    """
     ego: bool = False
     right_wrist: bool = False
@@ -71,71 +46,41 @@ class Action57DMask:
 @dataclass
 class UnifiedAction:
-    """Canonical 57D action for the viewer pipeline.
-    ``action`` is always shape ``(T, 57)`` with invalid slots zero-padded.
-    ``gripper_right`` / ``gripper_left`` carry auxiliary scalar gripper data
-    for embodiments that don't map to finger positions (V-shape visualisation).
-    """
-    action: np.ndarray  # (T, 57)
     mask: Action57DMask
-    gripper_right: np.ndarray | None = None  # (T,) scalar 0-1
-    gripper_left: np.ndarray | None = None  # (T,) scalar 0-1
 @dataclass
 class SceneState:
-    """Render-ready world-space geometry reconstructed from ``UnifiedAction``.
-    Contract:
-    - all SE(3) trajectories live in one shared ``scene_world`` frame
-    - fingertip positions are world-space if present
-    - gripper signals are scalar open/close values sampled at ``T+1`` frames
-    """
-    mask: Action57DMask = field(default_factory=Action57DMask)
-    # Absolute SE(3) trajectories — (T+1, 4, 4)
     ego_poses: np.ndarray | None = None
     right_poses: np.ndarray | None = None
     left_poses: np.ndarray | None = None
-    # World-space fingertip positions — (T+1, 5, 3)
     right_fingers: np.ndarray | None = None
     left_fingers: np.ndarray | None = None
-    # Scalar gripper — (T+1,)
     gripper_right: np.ndarray | None = None
     gripper_left: np.ndarray | None = None
-    # Metadata
-    video: np.ndarray | None = None  # (T+1, H, W, 3) uint8
-    action_raw: np.ndarray | None = None  # canonical 57D action tensor for display
     T: int = 0
-    # FK mesh animation: raw (T, nq) joint configs populated by datasets that
-    # perform EE conversion internally (e.g. robomind-ur). When set, the renderer
-    # uses these for FK mesh animation instead of running IK on right_poses.
-    joint_configs: np.ndarray | None = None
-# ─── Converters ───────────────────────────────────────────────────────────────
-def to_unified_from_57d(action: np.ndarray) -> UnifiedAction:
-    """57D hand_pose → passthrough, all 5 slots valid."""
-    return UnifiedAction(
-        action=action.astype(np.float32),
-        mask=Action57DMask(
-            ego=True,
-            right_wrist=True,
-            right_fingers=ALL_FINGERS,
-            left_wrist=True,
-            left_fingers=ALL_FINGERS,
-        ),
-    )
 def to_unified_from_10d(action: np.ndarray) -> UnifiedAction:
     """10D single arm ``[pos(3)+rot6d(6)+grip(1)]`` → right wrist + gripper."""
-    T = action.shape[0]
-    a = np.zeros((T, 57), dtype=np.float32)  # [T,57]
     a[:, 9:18] = action[:, :9]
     return UnifiedAction(
         action=a,
@@ -145,75 +90,50 @@ def to_unified_from_10d(action: np.ndarray) -> UnifiedAction:
 def to_unified_from_20d(action: np.ndarray) -> UnifiedAction:
-    """20D dual arm ``[left(10) | right(10)]`` → both wrists + both grippers.
-    Data layout: ``[L_pos(3) + L_rot6d(6) + L_grip(1) | R_pos(3) + R_rot6d(6) + R_grip(1)]``.
-    Maps left arm → left wrist slot [33:42], right arm → right wrist slot [9:18].
-    """
-    T = action.shape[0]
-    a = np.zeros((T, 57), dtype=np.float32)  # [T,57]
-    a[:, 33:42] = action[:, :9]  # left arm → left wrist slot [33:42]
-    a[:, 9:18] = action[:, 10:19]  # right arm → right wrist slot [9:18]
     return UnifiedAction(
         action=a,
         mask=Action57DMask(right_wrist=True, left_wrist=True),
-        gripper_right=action[:, 19].astype(np.float32),  # right arm gripper
-        gripper_left=action[:, 9].astype(np.float32),  # left arm gripper
-    )
-def to_unified_from_9d(action: np.ndarray) -> UnifiedAction:
-    """9D camera/AV ``[pos(3)+rot6d(6)]`` → ego only."""
-    T = action.shape[0]
-    a = np.zeros((T, 57), dtype=np.float32)  # [T,57]
-    a[:, 0:9] = action[:, :9]
-    return UnifiedAction(
-        action=a,
-        mask=Action57DMask(ego=True),
     )
 def _validate_action_shape(action: np.ndarray, action_format: ActionFormat) -> None:
-    """Raise when a raw action tensor does not match its declared format."""
     if action.ndim != 2:
         raise ValueError(f"Expected a rank-2 action array, got shape {action.shape}")
-    actual_dim = int(action.shape[-1])
     expected_dim = action_format.expected_dim
     if actual_dim != expected_dim:
         raise ValueError(f"Action format {action_format.value} expects trailing dim {expected_dim}, got {actual_dim}")
 def to_unified(action: np.ndarray, action_format: ActionFormat) -> UnifiedAction:
-    """Convert one explicit raw action format into ``UnifiedAction``."""
     _validate_action_shape(action, action_format)
-    if action_format is ActionFormat.UNIFIED_57D:
-        return to_unified_from_57d(action)
-    if action_format is ActionFormat.DUAL_ARM_20D:
-        return to_unified_from_20d(action)
     if action_format is ActionFormat.EGO_9D:
         return to_unified_from_9d(action)
     if action_format is ActionFormat.SINGLE_ARM_10D:
         return to_unified_from_10d(action)
     raise ValueError(f"Unsupported action format: {action_format}")
 def _pos_rot6d_to_mat(se3: np.ndarray) -> np.ndarray:
     """Convert ``(N, 9)`` pos+rot6d to ``(N, 4, 4)`` SE(3) matrices."""
-    N = se3.shape[0]
     pos = se3[:, :3]
     r6 = se3[:, 3:9]
     col0 = r6[:, :3].copy()
-    col0_norm = np.linalg.norm(col0, axis=-1, keepdims=True) + 1e-8
-    col0 = col0 / col0_norm
     col1 = r6[:, 3:6] - np.sum(r6[:, 3:6] * col0, axis=-1, keepdims=True) * col0
-    col1_norm = np.linalg.norm(col1, axis=-1, keepdims=True) + 1e-8
-    col1 = col1 / col1_norm
     col2 = np.cross(col0, col1)
-    mats = np.tile(np.eye(4, dtype=np.float32), (N, 1, 1))
     mats[:, :3, 0] = col0
     mats[:, :3, 1] = col1
     mats[:, :3, 2] = col2
@@ -226,255 +146,77 @@ def _chain_se3(
     initial_pose: np.ndarray | None = None,
     pose_convention: str = "backward_framewise",
 ) -> np.ndarray:
-    """Chain ``(T, 9)`` relative deltas into ``(T+1, 4, 4)`` absolute poses.
-    For ``backward_framewise``: ``P_{t+1} = P_t @ delta_t``.
-    For ``absolute``: each row is already an absolute pose (no chaining).
-    """
-    T = deltas.shape[0]
     delta_mats = _pos_rot6d_to_mat(deltas)
-    if initial_pose is None:
-        initial_pose = np.eye(4, dtype=np.float32)
-    else:
-        initial_pose = initial_pose.astype(np.float32)
-    poses = np.empty((T + 1, 4, 4), dtype=np.float32)
-    poses[0] = initial_pose
     if pose_convention == "absolute":
         poses[1:] = delta_mats
     else:
-        for t in range(T):
             poses[t + 1] = poses[t] @ delta_mats[t]
     return poses
-def _extract_fingers(raw: np.ndarray) -> np.ndarray:
-    """``(T, 15)`` → ``(T+1, 5, 3)`` with first frame duplicated."""
-    T = raw.shape[0]
-    fingers = raw.reshape(T, 5, 3).astype(np.float32)  # [T,5,3]
-    return np.concatenate([fingers[:1], fingers], axis=0)
-def _to_numpy_float32(value: object) -> np.ndarray:
-    """Convert a tensor-like value to a float32 NumPy array."""
-    if isinstance(value, torch.Tensor):
-        return value.detach().cpu().numpy().astype(np.float32)
-    return np.asarray(value, dtype=np.float32)
-def _quat_xyzw_to_rotmat(q: np.ndarray) -> np.ndarray:
-    """Convert ``(N, 4)`` xyzw quaternions to ``(N, 3, 3)`` rotation matrices."""
-    x, y, z, w = q[:, 0], q[:, 1], q[:, 2], q[:, 3]
-    R = np.zeros((len(q), 3, 3), dtype=np.float32)
-    R[:, 0, 0] = 1 - 2 * (y * y + z * z)
-    R[:, 0, 1] = 2 * (x * y - z * w)
-    R[:, 0, 2] = 2 * (x * z + y * w)
-    R[:, 1, 0] = 2 * (x * y + z * w)
-    R[:, 1, 1] = 1 - 2 * (x * x + z * z)
-    R[:, 1, 2] = 2 * (y * z - x * w)
-    R[:, 2, 0] = 2 * (x * z - y * w)
-    R[:, 2, 1] = 2 * (y * z + x * w)
-    R[:, 2, 2] = 1 - 2 * (x * x + y * y)
-    return R
-def _build_absolute_from_overlay(sample: dict) -> dict[str, np.ndarray] | None:
-    """Build absolute world-frame poses from HandPoseDataset overlay data.
-    Returns None if overlay keys are missing.
-    """
-    raw_cam_pos = sample.get("raw_cam_position")
-    if raw_cam_pos is None:
-        return None
-    cam_pos = _to_numpy_float32(raw_cam_pos)  # [T+1,3]
-    cam_rot_q = _to_numpy_float32(sample["raw_cam_rotation"])  # [T+1,4]
-    right_3d = _to_numpy_float32(sample["raw_cam_right_3d"])  # [T+1,63]
-    left_3d = _to_numpy_float32(sample["raw_cam_left_3d"])  # [T+1,63]
-    right_rot = _to_numpy_float32(sample["raw_cam_right_rot"])  # [T+1,84]
-    left_rot = _to_numpy_float32(sample["raw_cam_left_rot"])  # [T+1,84]
-    T1 = cam_pos.shape[0]
-    FTIP = [4, 8, 12, 16, 20]
-    # Camera c2w (world frame)
-    cam_c2w = np.tile(np.eye(4, dtype=np.float32), (T1, 1, 1))  # [T+1,4,4]
-    cam_c2w[:, :3, 3] = cam_pos
-    cam_c2w[:, :3, :3] = _quat_xyzw_to_rotmat(cam_rot_q)
-    def _wrist_world(pos_63, rot_84):
-        wrist_pos = pos_63[:, :3]
-        wrist_q = rot_84.reshape(T1, 21, 4)[:, 0]
-        wrist_cam = np.tile(np.eye(4, dtype=np.float32), (T1, 1, 1))  # [T+1,4,4]
-        wrist_cam[:, :3, 3] = wrist_pos
-        wrist_cam[:, :3, :3] = _quat_xyzw_to_rotmat(wrist_q)
-        return cam_c2w @ wrist_cam
-    def _fingers_world(pos_63):
-        joints = pos_63.reshape(T1, 21, 3)[:, FTIP]
-        R = cam_c2w[:, :3, :3]
-        t = cam_c2w[:, :3, 3]
-        return np.einsum("tij,tfj->tfi", R, joints) + t[:, None, :]  # [T+1,5,3]
-    return {
-        "ego_poses": cam_c2w,
-        "right_wrist_poses": _wrist_world(right_3d, right_rot),
-        "left_wrist_poses": _wrist_world(left_3d, left_rot),
-        "right_fingers": _fingers_world(right_3d),
-        "left_fingers": _fingers_world(left_3d),
-    }
-def _build_libero_absolute_from_state(sample: dict) -> dict[str, np.ndarray] | None:
-    """Build absolute LIBERO right-wrist poses from ``observation.state``.
-    LIBERO policy actions are normalized robosuite controller commands rather
-    than meter-scale SE(3) deltas. For visualization, the raw state sequence is
-    the correct source of metric end-effector poses.
-    """
-    if sample.get("source_repo_id") is None or sample.get("state") is None:
-        return None
-    if "libero" not in str(sample.get("source_repo_id", "")).lower():
-        return None
-    from cosmos_framework.data.vfm.action.libero_pose_utils import build_libero_abs_pose
-    to_opencv = str(sample.get("pose_coordinate_frame", "native")) == "opencv"
-    right_poses = build_libero_abs_pose(sample["state"], to_opencv=to_opencv)
-    return {"right_wrist_poses": right_poses.astype(np.float32, copy=False)}
-# ─── Scene State Builder ─────────────────────────────────────────────────────
 def build_scene_state(
     unified: UnifiedAction,
     initial_pose: np.ndarray | None = None,
     initial_pose_right: np.ndarray | None = None,
     initial_pose_left: np.ndarray | None = None,
     right_base_pose: np.ndarray | None = None,
     left_base_pose: np.ndarray | None = None,
     pose_convention: str = "backward_framewise",
-    sample: dict | None = None,
 ) -> SceneState:
-    """Reconstruct a canonical world-space ``SceneState`` from ``UnifiedAction``.
-    Chains SE(3) deltas for valid mask slots. If ``sample`` contains overlay
-    data (HandPoseDataset raw camera/joint fields), overrides with absolute
-    world-frame poses.
-    Args:
-        unified: Canonical 57D action with mask.
-        initial_pose: Default initial pose for all slots.
-        initial_pose_right: Override for right wrist (dual arm).
-        initial_pose_left: Override for left wrist (dual arm).
-        right_base_pose: Right-arm base pose that maps arm-local trajectories into ``scene_world``.
-        left_base_pose: Left-arm base pose that maps arm-local trajectories into ``scene_world``.
-        pose_convention: Pose convention for SE(3) chaining.
-        sample: Raw dataset sample (for overlay data).
-    """
-    def _apply_pose_base(poses: np.ndarray | None, base_pose: np.ndarray | None) -> np.ndarray | None:
-        if poses is None or base_pose is None:
-            return poses
-        return np.einsum("ij,njk->nik", base_pose, poses).astype(np.float32)  # [T+1,4,4]
-    def _fingers_local_to_world(
-        fingers_local: np.ndarray | None,
-        wrist_poses_world: np.ndarray | None,
-    ) -> np.ndarray | None:
-        if fingers_local is None:
-            return None
-        if wrist_poses_world is None:
-            raise ValueError("Finger trajectories require matching wrist poses to build world-space SceneState")
-        wrist_rot = wrist_poses_world[:, :3, :3].astype(np.float32)  # [T+1,3,3]
-        wrist_pos = wrist_poses_world[:, :3, 3].astype(np.float32)  # [T+1,3]
-        return np.einsum("tij,tfj->tfi", wrist_rot, fingers_local) + wrist_pos[:, None, :]  # [T+1,5,3]
-    mask = unified.mask
     action = unified.action
-    state = SceneState(mask=mask)
-    ip_default = initial_pose if initial_pose is not None else np.eye(4, dtype=np.float32)
-    ip_right = initial_pose_right if initial_pose_right is not None else ip_default
-    ip_left = initial_pose_left if initial_pose_left is not None else ip_default
     if mask.ego:
-        state.ego_poses = _chain_se3(action[:, 0:9], ip_default, pose_convention)
     if mask.right_wrist:
-        state.right_poses = _chain_se3(action[:, 9:18], ip_right, pose_convention)
-    if any(mask.right_fingers):
-        state.right_fingers = _extract_fingers(action[:, 18:33])
     if mask.left_wrist:
-        state.left_poses = _chain_se3(action[:, 33:42], ip_left, pose_convention)
-    if any(mask.left_fingers):
-        state.left_fingers = _extract_fingers(action[:, 42:57])
-    if unified.gripper_right is not None:
-        g = unified.gripper_right
-        state.gripper_right = np.concatenate([[g[0]], g]).astype(np.float32, copy=False)  # [T+1]
-    if unified.gripper_left is not None:
-        g = unified.gripper_left
-        state.gripper_left = np.concatenate([[g[0]], g]).astype(np.float32, copy=False)  # [T+1]
-    abs_data = _build_absolute_from_overlay(sample) if sample is not None else None
-    if abs_data is not None:
-        state.ego_poses = abs_data["ego_poses"]
-        state.right_poses = abs_data["right_wrist_poses"]
-        state.left_poses = abs_data["left_wrist_poses"]
-        state.right_fingers = abs_data["right_fingers"]
-        state.left_fingers = abs_data["left_fingers"]
-        log.info(
-            f"Overlay absolute mode | ego range: "
-            f"[{abs_data['ego_poses'][:, :3, 3].min():.3f}, "
-            f"{abs_data['ego_poses'][:, :3, 3].max():.3f}] | "
-            f"R wrist[0]: {abs_data['right_wrist_poses'][0, :3, 3]}"
         )
-    else:
-        state.right_poses = _apply_pose_base(state.right_poses, right_base_pose)
-        state.left_poses = _apply_pose_base(state.left_poses, left_base_pose)
-    libero_abs_data = _build_libero_absolute_from_state(sample) if sample is not None else None
-    if libero_abs_data is not None:
-        state.right_poses = libero_abs_data["right_wrist_poses"]
-    state.right_fingers = _fingers_local_to_world(state.right_fingers, state.right_poses)
-    state.left_fingers = _fingers_local_to_world(state.left_fingers, state.left_poses)
-    state.action_raw = unified.action.astype(np.float32)
     state.T = action.shape[0]
     return state
-# ─── Video Extraction ─────────────────────────────────────────────────────────
 def get_video_from_sample(sample: dict) -> np.ndarray | None:
-    """Extract video frames from a dataset sample.
-    Returns ``(T+1, H, W, 3)`` uint8 array, or None.
-    """
     video = sample.get("video")
     if video is None:
         return None
     if isinstance(video, torch.Tensor):
         video = video.numpy()
     if video.ndim == 4:
-        C, T_dim, H, W = video.shape
-        if C in (1, 3) and T_dim > 3:
             video = np.transpose(video, (1, 2, 3, 0))
-        elif video.shape[1] in (1, 3) and T_dim <= 3:
             video = np.transpose(video, (0, 2, 3, 1))
     if video.dtype in (np.float32, np.float64):
         video = np.clip(video * 255, 0, 255).astype(np.uint8)
     if video.ndim == 4 and video.shape[-1] == 1:
         video = np.repeat(video, 3, axis=-1)
     return video

 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+"""Raw action converters for the standalone action viewer.
+The Space release is raw-action-only. It accepts 9D ego/camera actions, 10D
+single-arm actions, and 20D dual-arm actions. The renderer still uses a fixed
+57D internal canvas so the downstream drawing code can stay format-agnostic,
+but hand-pose/57D datasets are intentionally not part of this release.
 """
 from __future__ import annotations
+from dataclasses import dataclass
 from enum import Enum
 from typing import Any
 import numpy as np
 import torch
 NO_FINGERS = (False, False, False, False, False)
     EGO_9D = "9d"
     SINGLE_ARM_10D = "10d"
     DUAL_ARM_20D = "20d"
     @property
     def expected_dim(self) -> int:
+        return {self.EGO_9D: 9, self.SINGLE_ARM_10D: 10, self.DUAL_ARM_20D: 20}[self]
 @dataclass
 class Action57DMask:
+    """Per-component validity for the internal 57D rendering canvas."""
     ego: bool = False
     right_wrist: bool = False
 @dataclass
 class UnifiedAction:
     """Internal 57D action plus scalar gripper side channels.

     Produced by the ``to_unified_from_*`` converters, which start from a
     zero-filled ``(T, 57)`` float32 canvas and copy each raw pos+rot6d group
     into its slot: ego at ``[0:9]``, right wrist at ``[9:18]``, left wrist at
     ``[33:42]``. The gripper scalar of each arm is not part of the canvas and
     travels in the ``gripper_*`` fields instead.
     """
     # (T, 57) float32 canvas; slots that no converter writes stay zero.
     action: np.ndarray
     # Flags telling consumers which canvas slots hold real data.
     mask: Action57DMask
     # Per-step gripper scalar for the right arm, shape (T,); None when the raw
     # format carries no right-arm gripper value.
     gripper_right: np.ndarray | None = None
     # Per-step gripper scalar for the left arm, shape (T,); None when absent.
     gripper_left: np.ndarray | None = None
 @dataclass
 class SceneState:
     """Renderer-ready reconstructed trajectories for one sample.

     ``build_scene_state`` creates an instance from a ``UnifiedAction`` and
     fills in the pose trajectories that the mask marks as active; every field
     it does not populate keeps its ``None`` default.
     """
     # Validity flags carried over from the source ``UnifiedAction``.
     mask: Action57DMask
     # Ego/camera poses from ``_chain_se3``: (T+1, 4, 4); None when ``mask.ego`` is false.
     ego_poses: np.ndarray | None = None
     # Right-wrist poses, (T+1, 4, 4), after the optional right base pose is applied.
     right_poses: np.ndarray | None = None
     # Left-wrist poses, (T+1, 4, 4), after the optional left base pose is applied.
     left_poses: np.ndarray | None = None
     # Fingertip trajectories; not populated by ``build_scene_state`` in this
     # raw-action release (hand-pose data is out of scope per the module docstring).
     right_fingers: np.ndarray | None = None
     left_fingers: np.ndarray | None = None
     # Gripper scalars supplied by ``build_scene_state`` from the ``UnifiedAction``.
     gripper_right: np.ndarray | None = None
     gripper_left: np.ndarray | None = None
     # Video frames for the sample; not set by ``build_scene_state`` — presumably
     # attached by the caller from ``get_video_from_sample`` (TODO confirm).
     video: np.ndarray | None = None
     # float32 copy of the 57D canvas the state was built from, shape (T, 57).
     action_raw: np.ndarray | None = None
     # Number of action steps (``action.shape[0]``); pose arrays hold T + 1 frames.
     T: int = 0
def to_unified_from_9d(action: np.ndarray) -> UnifiedAction:
    """Place a 9D camera/AV action ``[pos(3)+rot6d(6)]`` on the 57D canvas.

    Only the ego slot ``[0:9]`` is written; every other column stays zero and
    the returned mask marks ``ego`` as the sole active component.
    """
    num_steps = action.shape[0]
    canvas = np.zeros((num_steps, 57), dtype=np.float32)
    canvas[:, :9] = action[:, :9]
    ego_only = Action57DMask(ego=True)
    return UnifiedAction(action=canvas, mask=ego_only)
 def to_unified_from_10d(action: np.ndarray) -> UnifiedAction:
     """10D single arm ``[pos(3)+rot6d(6)+grip(1)]`` → right wrist + gripper."""
+    a = np.zeros((action.shape[0], 57), dtype=np.float32)
     a[:, 9:18] = action[:, :9]
     return UnifiedAction(
         action=a,
 def to_unified_from_20d(action: np.ndarray) -> UnifiedAction:
     """Place a 20D dual-arm action ``[left(10) | right(10)]`` on the 57D canvas.

     Each 10D half is ``pos(3)+rot6d(6)+grip(1)``. The left pose goes to the
     left-wrist slot ``[33:42]`` and the right pose to the right-wrist slot
     ``[9:18]``; the two gripper scalars are returned as side channels.
     """
     left_arm = action[:, :10]
     right_arm = action[:, 10:20]

     canvas = np.zeros((action.shape[0], 57), dtype=np.float32)
     canvas[:, 33:42] = left_arm[:, :9]
     canvas[:, 9:18] = right_arm[:, :9]

     both_wrists = Action57DMask(right_wrist=True, left_wrist=True)
     right_grip = right_arm[:, 9].astype(np.float32)
     left_grip = left_arm[:, 9].astype(np.float32)
     return UnifiedAction(
         action=canvas,
         mask=both_wrists,
         gripper_right=right_grip,
         gripper_left=left_grip,
     )
 def _validate_action_shape(action: np.ndarray, action_format: ActionFormat) -> None:
     if action.ndim != 2:
         raise ValueError(f"Expected a rank-2 action array, got shape {action.shape}")
     expected_dim = action_format.expected_dim
+    actual_dim = int(action.shape[-1])
     if actual_dim != expected_dim:
         raise ValueError(f"Action format {action_format.value} expects trailing dim {expected_dim}, got {actual_dim}")
 def to_unified(action: np.ndarray, action_format: ActionFormat) -> UnifiedAction:
     """Convert a raw action array of a supported format into a ``UnifiedAction``.

     The shape is validated first, then the converter matching
     ``action_format`` is applied.

     Raises:
         ValueError: If the shape does not match the format, or the format has
             no registered converter.
     """
     _validate_action_shape(action, action_format)
     converters = (
         (ActionFormat.EGO_9D, to_unified_from_9d),
         (ActionFormat.SINGLE_ARM_10D, to_unified_from_10d),
         (ActionFormat.DUAL_ARM_20D, to_unified_from_20d),
     )
     for known_format, convert in converters:
         if action_format is known_format:
             return convert(action)
     raise ValueError(f"Unsupported action format: {action_format}")
 def _pos_rot6d_to_mat(se3: np.ndarray) -> np.ndarray:
     """Convert ``(N, 9)`` pos+rot6d to ``(N, 4, 4)`` SE(3) matrices."""
+    n = se3.shape[0]
     pos = se3[:, :3]
     r6 = se3[:, 3:9]
     col0 = r6[:, :3].copy()
+    col0 = col0 / (np.linalg.norm(col0, axis=-1, keepdims=True) + 1e-8)
     col1 = r6[:, 3:6] - np.sum(r6[:, 3:6] * col0, axis=-1, keepdims=True) * col0
+    col1 = col1 / (np.linalg.norm(col1, axis=-1, keepdims=True) + 1e-8)
     col2 = np.cross(col0, col1)
+    mats = np.tile(np.eye(4, dtype=np.float32), (n, 1, 1))
     mats[:, :3, 0] = col0
     mats[:, :3, 1] = col1
     mats[:, :3, 2] = col2
     initial_pose: np.ndarray | None = None,
     pose_convention: str = "backward_framewise",
 ) -> np.ndarray:
+    """Chain ``(T, 9)`` relative deltas into ``(T+1, 4, 4)`` absolute poses."""
     delta_mats = _pos_rot6d_to_mat(deltas)
+    initial = np.eye(4, dtype=np.float32) if initial_pose is None else initial_pose.astype(np.float32)
+    poses = np.empty((deltas.shape[0] + 1, 4, 4), dtype=np.float32)
+    poses[0] = initial
     if pose_convention == "absolute":
         poses[1:] = delta_mats
     else:
+        for t in range(deltas.shape[0]):
             poses[t + 1] = poses[t] @ delta_mats[t]
     return poses
+def _apply_pose_base(poses: np.ndarray | None, base_pose: np.ndarray | None) -> np.ndarray | None:
+    if poses is None or base_pose is None:
+        return poses
+    return np.einsum("ij,tjk->tik", base_pose.astype(np.float32), poses).astype(np.float32)
 def build_scene_state(
     unified: UnifiedAction,
     *,
     initial_pose: np.ndarray | None = None,
     initial_pose_right: np.ndarray | None = None,
     initial_pose_left: np.ndarray | None = None,
     right_base_pose: np.ndarray | None = None,
     left_base_pose: np.ndarray | None = None,
     pose_convention: str = "backward_framewise",
     sample: dict[str, Any] | None = None,
 ) -> SceneState:
     """Reconstruct viewer trajectories from raw-action-derived ``UnifiedAction``.

     Args:
         unified: 57D canvas, validity mask and gripper side channels.
         initial_pose: Starting pose for the ego trajectory, and the fallback
             start for either wrist when no arm-specific pose is given.
         initial_pose_right: Starting pose for the right wrist.
         initial_pose_left: Starting pose for the left wrist.
         right_base_pose: Matrix left-multiplied onto every right-wrist pose.
         left_base_pose: Matrix left-multiplied onto every left-wrist pose.
         pose_convention: Forwarded to ``_chain_se3``.
         sample: Accepted for call-site compatibility; unused.

     Returns:
         A ``SceneState`` whose pose arrays hold ``T + 1`` frames for ``T``
         action steps. Gripper channels are extended to ``T + 1`` as well so
         that frame ``t`` of a gripper channel lines up with frame ``t`` of the
         poses.
     """
     del sample
     action = unified.action
     mask = unified.mask
     state = SceneState(
         mask=mask,
         gripper_right=_pad_gripper_to_frames(unified.gripper_right),
         gripper_left=_pad_gripper_to_frames(unified.gripper_left),
     )
     if mask.ego:
         state.ego_poses = _chain_se3(action[:, 0:9], initial_pose, pose_convention)
     if mask.right_wrist:
         start_right = initial_pose_right if initial_pose_right is not None else initial_pose
         state.right_poses = _chain_se3(action[:, 9:18], start_right, pose_convention)
     if mask.left_wrist:
         start_left = initial_pose_left if initial_pose_left is not None else initial_pose
         state.left_poses = _chain_se3(action[:, 33:42], start_left, pose_convention)
     # ``_apply_pose_base`` passes ``None`` through, so inactive arms stay ``None``.
     state.right_poses = _apply_pose_base(state.right_poses, right_base_pose)
     state.left_poses = _apply_pose_base(state.left_poses, left_base_pose)
     state.action_raw = action.astype(np.float32)
     state.T = action.shape[0]
     return state


 def _pad_gripper_to_frames(gripper: np.ndarray | None) -> np.ndarray | None:
     """Extend a per-step ``(T,)`` gripper channel to per-frame ``(T + 1,)``."""
     if gripper is None:
         return None
     per_step = np.asarray(gripper, dtype=np.float32)
     # Frame 0 precedes the first action, so it reuses the first sample. Slicing
     # instead of indexing keeps an empty channel from raising.
     return np.concatenate([per_step[:1], per_step])
 def get_video_from_sample(sample: dict) -> np.ndarray | None:
     """Extract video frames from a dataset sample as ``(T+1, H, W, 3)`` uint8.

     Args:
         sample: Dataset sample; its ``"video"`` entry may be a ``torch.Tensor``
             or array-like in ``(C, T, H, W)``, ``(T, C, H, W)`` or
             channel-last ``(T, H, W, C)`` layout with 1 or 3 channels.

     Returns:
         The frames in channel-last layout, or ``None`` when the sample has no
         ``"video"`` entry. Floating-point input is scaled by 255, clipped and
         cast to uint8; single-channel 4-D input is repeated to three channels.
     """
     video = sample.get("video")
     if video is None:
         return None
     if isinstance(video, torch.Tensor):
         # ``Tensor.numpy()`` raises for tensors that require grad or live on an
         # accelerator, so detach and move to host memory first.
         video = video.detach().cpu().numpy()
     else:
         video = np.asarray(video)
     # A trailing axis of size 1 or 3 means the clip is already channel-last.
     # Without this check a 1- or 3-frame ``(T, H, W, C)`` clip would be
     # mistaken for ``(C, T, H, W)`` and have its axes scrambled.
     if video.ndim == 4 and video.shape[-1] not in (1, 3):
         c, t_dim = video.shape[:2]
         if c in (1, 3) and t_dim > 3:
             # (C, T, H, W) -> (T, H, W, C)
             video = np.transpose(video, (1, 2, 3, 0))
         elif t_dim in (1, 3):
             # (T, C, H, W) -> (T, H, W, C)
             video = np.transpose(video, (0, 2, 3, 1))
     # Any floating dtype (float16 included) is scaled by 255 and clipped.
     if np.issubdtype(video.dtype, np.floating):
         video = np.clip(video * 255, 0, 255).astype(np.uint8)
     if video.ndim == 4 and video.shape[-1] == 1:
         video = np.repeat(video, 3, axis=-1)
     return video

cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/unified_renderer.py CHANGED Viewed

@@ -15,7 +15,7 @@ import numpy as np
 from cosmos_framework.utils import log
 from cosmos_framework.data.vfm.action.urdf_visualizer.robot_scene_model import RobotSceneModel
-from cosmos_framework.data.vfm.action.urdf_visualizer.unified_action import FINGER_NAMES, SceneState
 class UnifiedRenderer:
@@ -41,13 +41,7 @@ class UnifiedRenderer:
     COLOR_EGO_TOP = (231, 76, 60)  # red
     COLOR_RIGHT = (243, 156, 18)  # orange
     COLOR_LEFT = (155, 89, 182)  # purple
-    FINGER_COLORS = [
-        (231, 76, 60),
-        (241, 196, 15),
-        (46, 204, 113),
-        (52, 152, 219),
-        (155, 89, 182),
-    ]
     @staticmethod
     def _soften_color(color: tuple[int, int, int], mix: float = 0.35) -> tuple[int, int, int]:
@@ -135,23 +129,14 @@ class UnifiedRenderer:
             point_size=0.015,
             point_shape="circle",
         )
-        self.right_fingers = [
-            server.scene.add_icosphere(
-                self._p(f"/right/finger_{FINGER_NAMES[i]}"),
-                radius=self.TIP_RADIUS,
-                color=self.FINGER_COLORS[i],
-                position=(0.0, 0.0, 0.0),
-            )
-            for i in range(5)
-        ]
         self.right_gripper_tips = [
             server.scene.add_icosphere(
                 self._p(f"/right/gripper_tip_{side}"),
                 radius=self.TIP_RADIUS,
-                color=self.FINGER_COLORS[i],
                 position=(0.0, 0.0, 0.0),
             )
-            for i, side in enumerate(("thumb", "index"))
         ]
         # ── Left effector ──
@@ -171,23 +156,14 @@ class UnifiedRenderer:
             point_size=0.015,
             point_shape="circle",
         )
-        self.left_fingers = [
-            server.scene.add_icosphere(
-                self._p(f"/left/finger_{FINGER_NAMES[i]}"),
-                radius=self.TIP_RADIUS,
-                color=self.FINGER_COLORS[i],
-                position=(0.0, 0.0, 0.0),
-            )
-            for i in range(5)
-        ]
         self.left_gripper_tips = [
             server.scene.add_icosphere(
                 self._p(f"/left/gripper_tip_{side}"),
                 radius=self.TIP_RADIUS,
-                color=self.FINGER_COLORS[i],
                 position=(0.0, 0.0, 0.0),
             )
-            for i, side in enumerate(("thumb", "index"))
         ]
         # ── IK robot meshes ──
@@ -288,13 +264,6 @@ class UnifiedRenderer:
             self.right_traj,
             show,
         )
-        self._update_fingers(
-            t,
-            state.right_fingers,
-            mask.right_fingers,
-            self.right_fingers,
-            show,
-        )
         self._update_gripper(
             t,
             state.right_poses,
@@ -315,13 +284,6 @@ class UnifiedRenderer:
             self.left_traj,
             show,
         )
-        self._update_fingers(
-            t,
-            state.left_fingers,
-            mask.left_fingers,
-            self.left_fingers,
-            show,
-        )
         self._update_gripper(
             t,
             state.left_poses,
@@ -364,9 +326,6 @@ class UnifiedRenderer:
         def _v(active):
             return "✓" if active else "·"
-        gr = a[18:33].reshape(5, 3)
-        gl = a[42:57].reshape(5, 3)
         # Gripper auxiliary values (not in 57D vector)
         grip_r_str = ""
         grip_l_str = ""
@@ -383,10 +342,7 @@ class UnifiedRenderer:
             "",
             f"{_v(mask.right_wrist)} R wrist pos [9:12]   {_fmt(a[9:12])}",
             f"  {' ' * 1}    rot [12:18]  {_fmt(a[12:18])}",
-            f"  R fingers [18:33]",
         ]
-        for i, name in enumerate(FINGER_NAMES):
-            parts.append(f"  {_v(mask.right_fingers[i])} {name:7s} {_fmt(gr[i])}")
         if grip_r_str:
             parts.append(grip_r_str)
@@ -394,10 +350,7 @@ class UnifiedRenderer:
             "",
             f"{_v(mask.left_wrist)} L wrist pos [33:36]  {_fmt(a[33:36])}",
             f"  {' ' * 1}    rot [36:42]  {_fmt(a[36:42])}",
-            f"  L fingers [42:57]",
         ]
-        for i, name in enumerate(FINGER_NAMES):
-            parts.append(f"  {_v(mask.left_fingers[i])} {name:7s} {_fmt(gl[i])}")
         if grip_l_str:
             parts.append(grip_l_str)
@@ -476,26 +429,6 @@ class UnifiedRenderer:
             ee.visible = False
             traj.visible = False
-    # ─── Private: Fingers ─────────────────────────────────────────────────────
-    def _update_fingers(self, t, fingers, finger_mask, handles, show):
-        if fingers is None or t >= len(fingers):
-            for h in handles:
-                h.visible = False
-            return
-        if not show.get("fingertips", True):
-            for h in handles:
-                h.visible = False
-            return
-        g = fingers[t]  # (5, 3)
-        for fi, h in enumerate(handles):
-            if finger_mask[fi]:
-                h.position = g[fi].astype(np.float32)
-                h.visible = True
-            else:
-                h.visible = False
     # ─── Private: Gripper ─────────────────────────────────────────────────────
     def _update_gripper(self, t, poses, gripper, wrist_active, finger_mask, handle, show):
@@ -702,27 +635,6 @@ class UnifiedRenderer:
         if self._robot_scene_model is None:
             return
-        # Joint-position datasets (e.g. robomind-ur): bypass IK, use FK directly
-        if state.joint_configs is not None:
-            from cosmos_framework.data.vfm.action.urdf_visualizer.ik_solver import compute_mujoco_geom_transforms
-            from cosmos_framework.data.vfm.action.urdf_visualizer.robot_scene_model import get_mjcf_path
-            try:
-                mjcf_path = get_mjcf_path(entry.robot_name)
-                transforms, _, _fk_ee_poses, robot_frames = compute_mujoco_geom_transforms(
-                    mjcf_path, state.joint_configs
-                )
-                self._ik_right = transforms
-                self._robot_frames_right = robot_frames
-                self._rebuild_robot_frame_handles("right", robot_frames)
-                log.info(f"FK geom transforms computed for {len(transforms)} frames ({entry.robot_name})")
-            except Exception as e:
-                log.warning(f"FK failed for {entry.robot_name}: {e}")
-                import traceback
-                traceback.print_exc()
-            return
         # Right arm IK
         if state.right_poses is not None:
             try:
@@ -807,8 +719,6 @@ class UnifiedRenderer:
             self.left_traj,
         ]:
             attr.visible = False
-        for h in self.right_fingers + self.left_fingers:
-            h.visible = False
         for h in self.right_gripper_tips + self.left_gripper_tips:
             h.visible = False
         for h in self.robot_right + self.robot_left:

 from cosmos_framework.utils import log
 from cosmos_framework.data.vfm.action.urdf_visualizer.robot_scene_model import RobotSceneModel
+from cosmos_framework.data.vfm.action.urdf_visualizer.unified_action import SceneState
 class UnifiedRenderer:
     COLOR_EGO_TOP = (231, 76, 60)  # red
     COLOR_RIGHT = (243, 156, 18)  # orange
     COLOR_LEFT = (155, 89, 182)  # purple
+    GRIPPER_TIP_COLORS = [(231, 76, 60), (241, 196, 15)]
     @staticmethod
     def _soften_color(color: tuple[int, int, int], mix: float = 0.35) -> tuple[int, int, int]:
             point_size=0.015,
             point_shape="circle",
         )
         self.right_gripper_tips = [
             server.scene.add_icosphere(
                 self._p(f"/right/gripper_tip_{side}"),
                 radius=self.TIP_RADIUS,
+                color=self.GRIPPER_TIP_COLORS[i],
                 position=(0.0, 0.0, 0.0),
             )
+            for i, side in enumerate(("left", "right"))
         ]
         # ── Left effector ──
             point_size=0.015,
             point_shape="circle",
         )
         self.left_gripper_tips = [
             server.scene.add_icosphere(
                 self._p(f"/left/gripper_tip_{side}"),
                 radius=self.TIP_RADIUS,
+                color=self.GRIPPER_TIP_COLORS[i],
                 position=(0.0, 0.0, 0.0),
             )
+            for i, side in enumerate(("left", "right"))
         ]
         # ── IK robot meshes ──
             self.right_traj,
             show,
         )
         self._update_gripper(
             t,
             state.right_poses,
             self.left_traj,
             show,
         )
         self._update_gripper(
             t,
             state.left_poses,
         def _v(active):
             return "✓" if active else "·"
         # Gripper auxiliary values (not in 57D vector)
         grip_r_str = ""
         grip_l_str = ""
             "",
             f"{_v(mask.right_wrist)} R wrist pos [9:12]   {_fmt(a[9:12])}",
             f"  {' ' * 1}    rot [12:18]  {_fmt(a[12:18])}",
         ]
         if grip_r_str:
             parts.append(grip_r_str)
             "",
             f"{_v(mask.left_wrist)} L wrist pos [33:36]  {_fmt(a[33:36])}",
             f"  {' ' * 1}    rot [36:42]  {_fmt(a[36:42])}",
         ]
         if grip_l_str:
             parts.append(grip_l_str)
             ee.visible = False
             traj.visible = False
     # ─── Private: Gripper ─────────────────────────────────────────────────────
     def _update_gripper(self, t, poses, gripper, wrist_active, finger_mask, handle, show):
         if self._robot_scene_model is None:
             return
         # Right arm IK
         if state.right_poses is not None:
             try:
             self.left_traj,
         ]:
             attr.visible = False
         for h in self.right_gripper_tips + self.left_gripper_tips:
             h.visible = False
         for h in self.robot_right + self.robot_left:

cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/ur5e_robotiq_2f85.xml DELETED Viewed

@@ -1,326 +0,0 @@
-<!--
- SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- SPDX-License-Identifier: OpenMDW-1.1
--->
-<!-- UR5e + Robotiq 2F-85 gripper composite MJCF.
-     Created for RoboMIND UR visualization.
-     Arm (UR5e): qpos[0:6] — shoulder_pan, shoulder_lift, elbow, wrist_1, wrist_2, wrist_3
-     Gripper (Robotiq 2F-85): controlled via fingers_actuator tendon (ctrlrange 0-255).
-     Gripper signal from dataset: 0=open, 1=closed; scale by 255 before setting ctrl.
-     Mesh assets from MuJoCo Menagerie:
-       universal_robots_ur5e/assets/  (UR5e OBJ meshes)
-       robotiq_2f85_v4/assets/        (Robotiq STL meshes)
-     Both are merged into a single assets/ directory by _build_ur5e_robotiq().
--->
-<mujoco model="ur5e with robotiq 2f85">
-  <compiler angle="radian" meshdir="assets" autolimits="true"/>
-  <option integrator="implicitfast"/>
-  <default>
-    <default class="ur5e">
-      <material specular="0.5" shininess="0.25"/>
-      <joint axis="0 1 0" range="-6.28319 6.28319" armature="0.1"/>
-      <general gaintype="fixed" biastype="affine" ctrlrange="-6.2831 6.2831" gainprm="2000"
-        biasprm="0 -2000 -400" forcerange="-150 150"/>
-      <default class="size3">
-        <default class="size3_limited">
-          <joint range="-3.1415 3.1415"/>
-          <general ctrlrange="-3.1415 3.1415"/>
-        </default>
-      </default>
-      <default class="size1">
-        <general gainprm="500" biasprm="0 -500 -100" forcerange="-28 28"/>
-      </default>
-      <default class="visual">
-        <geom type="mesh" contype="0" conaffinity="0" group="2"/>
-      </default>
-      <default class="collision">
-        <geom type="capsule" group="3"/>
-        <default class="eef_collision">
-          <geom type="cylinder"/>
-        </default>
-      </default>
-      <site size="0.001" rgba="0.5 0.5 0.5 0.3" group="4"/>
-    </default>
-    <default class="2f85">
-      <mesh scale="0.001 0.001 0.001"/>
-      <general biastype="affine"/>
-      <joint axis="0 0 1"/>
-      <default class="driver">
-        <joint range="0 0.9" armature="0.005" damping="0.1"
-          solimplimit="0.95 0.99 0.001" solreflimit="0.005 1"/>
-      </default>
-      <default class="follower">
-        <joint range="-0.872664 0.9" armature="0.001"
-          solimplimit="0.95 0.99 0.001" solreflimit="0.005 1"/>
-      </default>
-      <default class="spring_link">
-        <joint range="-0.29670597283 0.9" armature="0.001"
-          stiffness="0.05" springref="2.62" damping="0.00125"/>
-      </default>
-      <default class="coupler">
-        <joint range="-1.57 0" armature="0.001"
-          solimplimit="0.95 0.99 0.001" solreflimit="0.005 1"/>
-      </default>
-      <default class="visual_gripper">
-        <geom type="mesh" contype="0" conaffinity="0" group="2" material="robotiq_black"/>
-      </default>
-      <default class="collision_gripper">
-        <geom group="3" type="mesh" contype="0" conaffinity="0"/>
-        <default class="pad_box1">
-          <geom mass="0.00175" type="box" pos="0.043258 0 0.12"
-            size="0.002 0.011 0.009375" friction="0.7"
-            solimp="0.95 0.99 0.001" solref="0.004 1" priority="1"
-            rgba="0.55 0.55 0.55 1"/>
-        </default>
-        <default class="pad_box2">
-          <geom mass="0.00175" type="box" pos="0.043258 0 0.13875"
-            size="0.002 0.011 0.009375" friction="0.6"
-            solimp="0.95 0.99 0.001" solref="0.004 1" priority="1"
-            rgba="0.45 0.45 0.45 1"/>
-        </default>
-      </default>
-    </default>
-  </default>
-  <asset>
-    <!-- UR5e materials -->
-    <material class="ur5e" name="ur5e_black"     rgba="0.033 0.033 0.033 1"/>
-    <material class="ur5e" name="jointgray"      rgba="0.278 0.278 0.278 1"/>
-    <material class="ur5e" name="linkgray"       rgba="0.82 0.82 0.82 1"/>
-    <material class="ur5e" name="urblue"         rgba="0.49 0.678 0.8 1"/>
-    <!-- Robotiq materials -->
-    <material name="robotiq_metal"   rgba="0.58 0.58 0.58 1"/>
-    <material name="robotiq_silicone" rgba="0.1882 0.1882 0.1882 1"/>
-    <material name="robotiq_black"   rgba="0.149 0.149 0.149 1"/>
-    <!-- UR5e meshes -->
-    <mesh file="base_0.obj"/>
-    <mesh file="base_1.obj"/>
-    <mesh file="shoulder_0.obj"/>
-    <mesh file="shoulder_1.obj"/>
-    <mesh file="shoulder_2.obj"/>
-    <mesh file="upperarm_0.obj"/>
-    <mesh file="upperarm_1.obj"/>
-    <mesh file="upperarm_2.obj"/>
-    <mesh file="upperarm_3.obj"/>
-    <mesh file="forearm_0.obj"/>
-    <mesh file="forearm_1.obj"/>
-    <mesh file="forearm_2.obj"/>
-    <mesh file="forearm_3.obj"/>
-    <mesh file="wrist1_0.obj"/>
-    <mesh file="wrist1_1.obj"/>
-    <mesh file="wrist1_2.obj"/>
-    <mesh file="wrist2_0.obj"/>
-    <mesh file="wrist2_1.obj"/>
-    <mesh file="wrist2_2.obj"/>
-    <mesh file="wrist3.obj"/>
-    <!-- Robotiq 2F-85 meshes — NO class: STL assets are in meters, scale=0.001 must NOT apply -->
-    <mesh file="base.stl"/>
-    <mesh file="base_coupling.stl"/>
-    <mesh file="c-a01-85-open.stl"/>
-    <mesh file="driver.stl"/>
-    <mesh file="coupler.stl"/>
-    <mesh file="spring_link.stl"/>
-    <mesh file="follower.stl"/>
-    <mesh file="tongue.stl"/>
-  </asset>
-  <worldbody>
-    <light name="spotlight" mode="targetbodycom" target="wrist_2_link" pos="0 -1 2"/>
-    <body name="base" quat="0 0 0 -1" childclass="ur5e">
-      <inertial mass="4.0" pos="0 0 0" diaginertia="0.00443333156 0.00443333156 0.0072"/>
-      <geom mesh="base_0" material="ur5e_black" class="visual"/>
-      <geom mesh="base_1" material="jointgray" class="visual"/>
-      <body name="shoulder_link" pos="0 0 0.163">
-        <inertial mass="3.7" pos="0 0 0" diaginertia="0.0102675 0.0102675 0.00666"/>
-        <joint name="shoulder_pan_joint" class="size3" axis="0 0 1"/>
-        <geom mesh="shoulder_0" material="urblue" class="visual"/>
-        <geom mesh="shoulder_1" material="ur5e_black" class="visual"/>
-        <geom mesh="shoulder_2" material="jointgray" class="visual"/>
-        <geom class="collision" size="0.06 0.06" pos="0 0 -0.04"/>
-        <body name="upper_arm_link" pos="0 0.138 0" quat="1 0 1 0">
-          <inertial mass="8.393" pos="0 0 0.2125" diaginertia="0.133886 0.133886 0.0151074"/>
-          <joint name="shoulder_lift_joint" class="size3"/>
-          <geom mesh="upperarm_0" material="linkgray" class="visual"/>
-          <geom mesh="upperarm_1" material="ur5e_black" class="visual"/>
-          <geom mesh="upperarm_2" material="jointgray" class="visual"/>
-          <geom mesh="upperarm_3" material="urblue" class="visual"/>
-          <geom class="collision" pos="0 -0.04 0" quat="1 1 0 0" size="0.06 0.06"/>
-          <geom class="collision" size="0.05 0.2" pos="0 0 0.2"/>
-          <body name="forearm_link" pos="0 -0.131 0.425">
-            <inertial mass="2.275" pos="0 0 0.196" diaginertia="0.0311796 0.0311796 0.004095"/>
-            <joint name="elbow_joint" class="size3_limited"/>
-            <geom mesh="forearm_0" material="urblue" class="visual"/>
-            <geom mesh="forearm_1" material="linkgray" class="visual"/>
-            <geom mesh="forearm_2" material="ur5e_black" class="visual"/>
-            <geom mesh="forearm_3" material="jointgray" class="visual"/>
-            <geom class="collision" pos="0 0.08 0" quat="1 1 0 0" size="0.055 0.06"/>
-            <geom class="collision" size="0.038 0.19" pos="0 0 0.2"/>
-            <body name="wrist_1_link" pos="0 0 0.392" quat="1 0 1 0">
-              <inertial mass="1.219" pos="0 0.127 0" diaginertia="0.0025599 0.0025599 0.0021942"/>
-              <joint name="wrist_1_joint" class="size1"/>
-              <geom mesh="wrist1_0" material="ur5e_black" class="visual"/>
-              <geom mesh="wrist1_1" material="urblue" class="visual"/>
-              <geom mesh="wrist1_2" material="jointgray" class="visual"/>
-              <geom class="collision" pos="0 0.05 0" quat="1 1 0 0" size="0.04 0.07"/>
-              <body name="wrist_2_link" pos="0 0.127 0">
-                <inertial mass="1.219" pos="0 0 0.1" diaginertia="0.0025599 0.0025599 0.0021942"/>
-                <joint name="wrist_2_joint" axis="0 0 1" class="size1"/>
-                <geom mesh="wrist2_0" material="ur5e_black" class="visual"/>
-                <geom mesh="wrist2_1" material="urblue" class="visual"/>
-                <geom mesh="wrist2_2" material="jointgray" class="visual"/>
-                <geom class="collision" size="0.04 0.06" pos="0 0 0.04"/>
-                <geom class="collision" pos="0 0.02 0.1" quat="1 1 0 0" size="0.04 0.04"/>
-                <body name="wrist_3_link" pos="0 0 0.1">
-                  <inertial mass="0.1889" pos="0 0.0771683 0" quat="1 0 0 1"
-                    diaginertia="0.000132134 9.90863e-05 9.90863e-05"/>
-                  <joint name="wrist_3_joint" class="size1"/>
-                  <geom material="linkgray" mesh="wrist3" class="visual"/>
-                  <geom class="eef_collision" pos="0 0.08 0" quat="1 1 0 0" size="0.04 0.02"/>
-                  <site name="attachment_site" pos="0 0.1 0" quat="-1 1 0 0"/>
-                  <!-- Robotiq 2F-85 attached at the UR5e flange site -->
-                  <body name="robotiq_base" childclass="2f85" pos="0 0.1 0" quat="-1 1 0 0">
-                    <inertial mass="0.777441" pos="7.77116e-05 8.42713e-05 0.0311656"
-                      quat="0.704758 -0.00373684 -0.00570287 0.709415"
-                      diaginertia="0.000260285 0.000225381 0.000152708"/>
-                    <geom class="visual_gripper" pos="0 0 0.0108" quat="0 0 0 1" mesh="base"/>
-                    <geom class="visual_gripper" pos="0 0 0.004" quat="1 -1 0 0" mesh="base_coupling" material="robotiq_metal"/>
-                    <geom class="visual_gripper" pos="0 0 0.0108" quat="1 0 0 0" material="robotiq_metal" mesh="c-a01-85-open"/>
-                    <geom class="collision_gripper" pos="0 0 0.0108" quat="0 0 0 1" mesh="base"/>
-                    <body name="robotiq_left_driver" pos="-0.0306011 0.00475 0.0657045" quat="1 -1 0 0">
-                      <inertial mass="0.00899563" pos="-0.0175297 0.00165308 -0.00469625"
-                        quat="-0.469642 0.469642 -0.528617 0.528617"
-                        diaginertia="1.72352e-06 1.60906e-06 3.22006e-07"/>
-                      <joint name="left_driver_joint" class="driver"/>
-                      <geom class="visual_gripper" pos="0.0306011 0.0549045 -0.0047" quat="1 1 0 0" material="robotiq_metal" mesh="driver"/>
-                      <geom class="collision_gripper" pos="0.0306011 0.0549045 -0.0047" quat="1 1 0 0" mesh="driver"/>
-                      <body name="robotiq_left_coupler" pos="-0.0314249 0.00453223 -0.0102" quat="0 0 0 1">
-                        <inertial mass="0.0140974" pos="0.00367747 0.01986 0.0055"
-                          quat="0.701447 -0.701447 0.0892884 -0.0892884"
-                          diaginertia="4.16206e-06 3.52216e-06 8.88131e-07"/>
-                        <geom class="visual_gripper" pos="-0.062026 -0.0503723 0.0055" quat="1 -1 0 0" mesh="coupler"/>
-                        <geom class="collision_gripper" pos="-0.062026 -0.0503723 0.0055" quat="1 -1 0 0" mesh="coupler"/>
-                      </body>
-                    </body>
-                    <body name="robotiq_left_spring_link" pos="-0.0127 -0.012 0.07222" quat="1 -1 0 0">
-                      <inertial mass="0.0221642" pos="-0.0183 -0.0205732 0.01205"
-                        quat="0.660941 0.660941 -0.251309 -0.251309"
-                        diaginertia="8.96853e-06 6.71733e-06 2.63931e-06"/>
-                      <joint name="left_spring_link_joint" class="spring_link"/>
-                      <geom class="visual_gripper" pos="0.0127 0.06142 0.01205" quat="1 1 0 0" mesh="spring_link"/>
-                      <geom class="collision_gripper" pos="0.0127 0.06142 0.01205" quat="1 1 0 0" mesh="spring_link"/>
-                      <body name="robotiq_left_follower" pos="-0.0382079 -0.0425003 0.00295" quat="0 -1 -1.90231e-05 0">
-                        <inertial mass="0.0125222" pos="-0.00852976 -0.0014822 -0.00910001"
-                          quat="0.359439 0.359439 0.608937 0.608937"
-                          diaginertia="2.67415e-06 2.4559e-06 6.02031e-07"/>
-                        <joint name="left_follower_joint" class="follower"/>
-                        <geom class="visual_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" mesh="follower"/>
-                        <geom class="visual_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" material="robotiq_metal" mesh="tongue"/>
-                        <geom class="collision_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" mesh="follower"/>
-                        <geom class="collision_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" mesh="tongue"/>
-                        <body name="robotiq_left_pad" pos="-0.0377897 -0.103916 -0.0091" quat="1 -1 0 0">
-                          <geom class="pad_box1" name="left_pad1"/>
-                          <geom class="pad_box2" name="left_pad2"/>
-                        </body>
-                      </body>
-                    </body>
-                    <body name="robotiq_right_driver" pos="0.0306011 -0.00475 0.0657045" quat="0 0 -1 1">
-                      <inertial mass="0.00899563" pos="-0.0175297 0.00165308 -0.00469625"
-                        quat="-0.469642 0.469642 -0.528617 0.528617"
-                        diaginertia="1.72352e-06 1.60906e-06 3.22006e-07"/>
-                      <joint name="right_driver_joint" class="driver"/>
-                      <geom class="visual_gripper" pos="0.0306011 0.0549045 -0.0047" quat="1 1 0 0" material="robotiq_metal" mesh="driver"/>
-                      <geom class="collision_gripper" pos="0.0306011 0.0549045 -0.0047" quat="1 1 0 0" mesh="driver"/>
-                      <body name="robotiq_right_coupler" pos="-0.0314249 0.00453223 -0.0102" quat="0 0 0 1">
-                        <inertial mass="0.0140974" pos="0.00367747 0.01986 0.0055"
-                          quat="0.701447 -0.701447 0.0892884 -0.0892884"
-                          diaginertia="4.16206e-06 3.52216e-06 8.88131e-07"/>
-                        <geom class="visual_gripper" pos="-0.062026 -0.0503723 0.0055" quat="1 -1 0 0" mesh="coupler"/>
-                        <geom class="collision_gripper" pos="-0.062026 -0.0503723 0.0055" quat="1 -1 0 0" mesh="coupler"/>
-                      </body>
-                    </body>
-                    <body name="robotiq_right_spring_link" pos="0.0127 0.012 0.07222" quat="0 0 -1 1">
-                      <inertial mass="0.0221642" pos="-0.0183 -0.0205732 0.01205"
-                        quat="0.660941 0.660941 -0.251309 -0.251309"
-                        diaginertia="8.96853e-06 6.71733e-06 2.63931e-06"/>
-                      <joint name="right_spring_link_joint" class="spring_link"/>
-                      <geom class="visual_gripper" pos="0.0127 0.06142 0.01205" quat="1 1 0 0" mesh="spring_link"/>
-                      <geom class="collision_gripper" pos="0.0127 0.06142 0.01205" quat="1 1 0 0" mesh="spring_link"/>
-                      <body name="robotiq_right_follower" pos="-0.0382079 -0.0425003 0.00295" quat="0 -1 0 0">
-                        <inertial mass="0.0125222" pos="-0.00852976 -0.0014822 -0.00910001"
-                          quat="0.359439 0.359439 0.608937 0.608937"
-                          diaginertia="2.67415e-06 2.4559e-06 6.02031e-07"/>
-                        <joint name="right_follower_joint" class="follower"/>
-                        <geom class="visual_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" material="robotiq_metal" mesh="tongue"/>
-                        <geom class="visual_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" mesh="follower"/>
-                        <geom class="collision_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" mesh="tongue"/>
-                        <geom class="collision_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" mesh="follower"/>
-                        <body name="robotiq_right_pad" pos="-0.0377897 -0.103916 -0.0091" quat="1 -1 0 0">
-                          <geom class="pad_box1" name="right_pad1"/>
-                          <geom class="pad_box2" name="right_pad2"/>
-                        </body>
-                      </body>
-                    </body>
-                  </body>
-                </body>
-              </body>
-            </body>
-          </body>
-        </body>
-      </body>
-    </body>
-  </worldbody>
-  <!-- UR5e arm actuators -->
-  <actuator>
-    <general class="size3"         name="shoulder_pan"  joint="shoulder_pan_joint"/>
-    <general class="size3"         name="shoulder_lift" joint="shoulder_lift_joint"/>
-    <general class="size3_limited" name="elbow"         joint="elbow_joint"/>
-    <general class="size1"         name="wrist_1"       joint="wrist_1_joint"/>
-    <general class="size1"         name="wrist_2"       joint="wrist_2_joint"/>
-    <general class="size1"         name="wrist_3"       joint="wrist_3_joint"/>
-    <!-- Robotiq gripper actuator (ctrl 0=open, 255=closed) -->
-    <general class="2f85" name="fingers_actuator" tendon="split"
-      forcerange="-5 5" ctrlrange="0 255"
-      gainprm="0.3137255 0 0" biasprm="0 -100 -10"/>
-  </actuator>
-  <contact>
-    <exclude body1="robotiq_base" body2="robotiq_left_driver"/>
-    <exclude body1="robotiq_base" body2="robotiq_right_driver"/>
-    <exclude body1="robotiq_base" body2="robotiq_left_spring_link"/>
-    <exclude body1="robotiq_base" body2="robotiq_right_spring_link"/>
-    <exclude body1="robotiq_right_coupler" body2="robotiq_right_follower"/>
-    <exclude body1="robotiq_left_coupler" body2="robotiq_left_follower"/>
-  </contact>
-  <tendon>
-    <fixed name="split">
-      <joint joint="right_driver_joint" coef="0.485"/>
-      <joint joint="left_driver_joint"  coef="0.485"/>
-    </fixed>
-  </tendon>
-  <equality>
-    <connect anchor="-0.0179014 -0.00651468 0.0044"
-      body1="robotiq_right_follower" body2="robotiq_right_coupler"
-      solimp="0.95 0.99 0.001" solref="0.005 1"/>
-    <connect anchor="-0.0179014 -0.00651468 0.0044"
-      body1="robotiq_left_follower"  body2="robotiq_left_coupler"
-      solimp="0.95 0.99 0.001" solref="0.005 1"/>
-    <joint joint1="right_driver_joint" joint2="left_driver_joint"
-      polycoef="0 1 0 0 0" solimp="0.95 0.99 0.001" solref="0.005 1"/>
-  </equality>
-  <keyframe>
-    <key name="home" qpos="-1.5708 -1.5708 1.5708 -1.5708 -1.5708 0 0 0 0 0 0 0"
-      ctrl="-1.5708 -1.5708 1.5708 -1.5708 -1.5708 0 0"/>
-  </keyframe>
-</mujoco>

cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/urdf_loader.py CHANGED Viewed

@@ -164,7 +164,6 @@ _EE_FRAME_CANDIDATES = [
     "gripper",  # Google Robot (Menagerie site)
     "hand",  # Franka Panda (standard gripper)
     "attachment",  # Franka + Robotiq
-    "attachment_site",  # UR5e (mujoco_menagerie)
     "wx250s/gripper_link",  # WidowX 250S (Menagerie)
     "ee_link",  # generic
     "tool0",  # generic industrial
@@ -257,19 +256,6 @@ ROBOT_CONFIGS = {
         "finger_joint_names": ["left_finger", "right_finger"],
         "camera_body": None,
     },
-    "ur5e": {
-        "menagerie": "ur5e_robotiq",
-        "mjcf": "ur5e_robotiq_2f85.xml",
-        # EE: attachment_site — the UR5e flange site where the Robotiq mounts.
-        # joint_configs[:, 6] raw UR gripper maps directly to Robotiq ctrl:
-        # ctrl = raw * 255 (0=open, 255=closed).
-        "ee_frame": "attachment_site",
-        "n_arm_joints": 6,
-        "finger_min": 0.0,
-        "finger_max": 255.0,
-        "finger_close_is_max": False,
-        "camera_body": None,
-    },
 }
@@ -344,10 +330,6 @@ def _ensure_robot_assets(robot_name: str) -> Path:
     if robot_name == "droid_franka_robotiq":
         return _build_droid_franka_robotiq()
-    # Composite: UR5e arm + Robotiq 2F-85 gripper (RoboMIND UR setup)
-    if robot_name == "ur5e_robotiq":
-        return _build_ur5e_robotiq()
     log.info(f"Downloading {robot_name} assets from mujoco_menagerie...")
     _download_menagerie_model(robot_name)
     return cached_dir
@@ -420,38 +402,6 @@ def _build_droid_franka_robotiq() -> Path:
     return dst
-def _build_ur5e_robotiq() -> Path:
-    """Prepare composite UR5e + Robotiq 2F-85 MJCF for RoboMIND UR visualization.
-    The hand-tuned composite XML is committed in the repo alongside this
-    module (``ur5e_robotiq_2f85.xml``).  This function downloads the mesh
-    assets from MuJoCo Menagerie (``universal_robots_ur5e`` +
-    ``robotiq_2f85_v4``) and copies them together with the XML into a cache
-    directory that MuJoCo can load from.
-    """
-    import shutil
-    log.info("Preparing composite UR5e + Robotiq 2F-85 (committed XML + Menagerie assets)...")
-    ur5e_dir = _download_menagerie_model("universal_robots_ur5e")
-    robotiq_dir = _download_menagerie_model("robotiq_2f85_v4")
-    dst = _ROBOT_CACHE_DIR / "ur5e_robotiq"
-    dst.mkdir(parents=True, exist_ok=True)
-    assets_dir = dst / "assets"
-    assets_dir.mkdir(exist_ok=True)
-    for f in (ur5e_dir / "assets").iterdir():
-        shutil.copy2(f, assets_dir / f.name)
-    for f in (robotiq_dir / "assets").iterdir():
-        shutil.copy2(f, assets_dir / f.name)
-    committed_xml = Path(__file__).parent / "ur5e_robotiq_2f85.xml"
-    shutil.copy2(committed_xml, dst / "ur5e_robotiq_2f85.xml")
-    log.info(f"Prepared composite UR5e + Robotiq 2F-85 at {dst}")
-    return dst
 # ── Robot loaders ────────────────────────────────────────────────────────────
@@ -563,35 +513,6 @@ def _load_widowx() -> tuple[list, np.ndarray]:
     return meshes, ee_pose
-def _load_ur5e() -> tuple[list, np.ndarray]:
-    """Load UR5e + Robotiq 2F-85 composite from MuJoCo Menagerie."""
-    import mujoco
-    mjcf_dir = _ensure_robot_assets("ur5e_robotiq")
-    mjcf_path = mjcf_dir / "ur5e_robotiq_2f85.xml"
-    model = mujoco.MjModel.from_xml_path(str(mjcf_path))
-    data = mujoco.MjData(model)
-    # UR5e home: -90, -90, 90, -90, -90, 0 (degrees → radians); gripper open
-    home_qpos = np.array([-1.5708, -1.5708, 1.5708, -1.5708, -1.5708, 0.0])
-    data.qpos[: len(home_qpos)] = home_qpos
-    mujoco.mj_forward(model, data)
-    meshes = _extract_mujoco_meshes(model, data)
-    # EE pose: use robotiq_base body (the flange-to-gripper attachment point)
-    ee_pose = np.eye(4, dtype=np.float32)
-    for i in range(model.nbody):
-        name = mujoco.mj_id2name(model, mujoco.mjtObj.mjOBJ_BODY, i) or ""
-        if name == "robotiq_base":
-            ee_pose[:3, 3] = data.xpos[i].astype(np.float32)
-            ee_pose[:3, :3] = data.xmat[i].reshape(3, 3).astype(np.float32)
-            break
-    log.info(f"UR5e+Robotiq loaded: {len(meshes)} meshes, EE pos={ee_pose[:3, 3]}")
-    return meshes, ee_pose
 def _parse_urdf_origin(origin_element) -> np.ndarray:
     transform = np.eye(4, dtype=np.float32)
     if origin_element is None:
@@ -856,64 +777,4 @@ def get_robot_loaders() -> dict[str, callable]:
         "google_robot": _load_google_robot,
         "franka_panda": _load_franka_panda,
         "widowx": _load_widowx,
-        "ur5e": _load_ur5e,
     }
-def extract_gripper_openings(unified_57d: np.ndarray, robot_name: str = "google_robot") -> np.ndarray:
-    """Extract gripper opening fractions from unified action grasp state.
-    Uses fingertip spread (f0-f1 distance) to invert the FK and recover
-    the scalar gripper opening at each timestep.
-    Args:
-        unified_57d: (T, 57) unified action.
-        robot_name: Robot identifier for FK lookup table.
-    Returns:
-        (T+1,) array of gripper openings in [0, 1].
-    """
-    T = unified_57d.shape[0]
-    grasp = unified_57d[:, 18:33].reshape(T, 5, 3)
-    all_grasp = np.concatenate([grasp[0:1], grasp], axis=0)
-    # Build monotonic inverse lookup from FK (robot-specific)
-    _gs = np.linspace(0, 1, 10001).astype(np.float32)
-    try:
-        if robot_name == "franka_panda":
-            from cosmos_framework.data.vfm.action.robot_descriptions.franka import franka_fingertip_fk
-            _tips = franka_fingertip_fk(_gs)
-        elif robot_name == "widowx":
-            from cosmos_framework.data.vfm.action.robot_descriptions.widowx import widowx_fingertip_fk
-            _tips = widowx_fingertip_fk(_gs)
-        elif robot_name == "ur5e":
-            from cosmos_framework.data.vfm.action.robot_descriptions.umi import _WSG50_MAX_WIDTH, umi_fingertip_fk
-            _tips = umi_fingertip_fk(_gs * _WSG50_MAX_WIDTH)
-        else:
-            from cosmos_framework.data.vfm.action.robot_descriptions.google_robot import (
-                google_robot_fingertip_fk_vectorized,
-            )
-            _tips = google_robot_fingertip_fk_vectorized(_gs)
-        _spreads = np.linalg.norm(_tips[:, 0] - _tips[:, 1], axis=1)
-    except ImportError:
-        # Fallback: linear approximation
-        _spreads = _gs * 0.145
-    _min_idx = int(np.argmin(_spreads))
-    mono_gs = _gs[_min_idx:]
-    mono_spreads = _spreads[_min_idx:]
-    openings = np.zeros(T + 1, dtype=np.float32)
-    for t in range(T + 1):
-        f0, f1 = all_grasp[t, 0], all_grasp[t, 1]
-        spread = np.linalg.norm(f0 - f1)
-        if spread <= mono_spreads[0]:
-            openings[t] = 0.0
-        elif spread >= mono_spreads[-1]:
-            openings[t] = 1.0
-        else:
-            openings[t] = float(np.interp(spread, mono_spreads, mono_gs))
-    return openings

     "gripper",  # Google Robot (Menagerie site)
     "hand",  # Franka Panda (standard gripper)
     "attachment",  # Franka + Robotiq
     "wx250s/gripper_link",  # WidowX 250S (Menagerie)
     "ee_link",  # generic
     "tool0",  # generic industrial
         "finger_joint_names": ["left_finger", "right_finger"],
         "camera_body": None,
     },
 }
     if robot_name == "droid_franka_robotiq":
         return _build_droid_franka_robotiq()
     log.info(f"Downloading {robot_name} assets from mujoco_menagerie...")
     _download_menagerie_model(robot_name)
     return cached_dir
     return dst
 # ── Robot loaders ────────────────────────────────────────────────────────────
     return meshes, ee_pose
 def _parse_urdf_origin(origin_element) -> np.ndarray:
     transform = np.eye(4, dtype=np.float32)
     if origin_element is None:
         "google_robot": _load_google_robot,
         "franka_panda": _load_franka_panda,
         "widowx": _load_widowx,
     }

cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py CHANGED Viewed

@@ -4,11 +4,8 @@
 """Interactive 3D viewer for robot action datasets.
-Uses the unified 57D action representation: every dataset declares one explicit
-raw ``ActionFormat`` (9D/10D/20D/57D), which is converted to
-``UnifiedAction(action_57d, mask)`` before rendering.
-**57D layout**: ``[ego(9) | R_wrist(9) | R_fingers(15) | L_wrist(9) | L_fingers(15)]``
 Dependencies::
@@ -18,8 +15,6 @@ Usage:
     # Use each dataset's declared raw action format:
     uv run python cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py --share
-    # Override the raw action format explicitly:
-    uv run python cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py --action-format 57d --share
 """
 from __future__ import annotations
@@ -92,7 +87,6 @@ def _lazycfg_to_entry(
     dataset_class = target if isinstance(target, str) else f"{target.__module__}.{target.__qualname__}"
     ds_items = ds_cfg.items() if isinstance(ds_cfg, dict) else ds_cfg.items()
     dataset_kwargs = {key: value for key, value in ds_items if key != "_target_"}
-    dataset_kwargs["action_normalization"] = None
     if viewer_overrides is not None:
         dataset_kwargs.update(viewer_overrides)
@@ -131,8 +125,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
         DATASET_UMI_256,
     )
-    raw_action_override = {"action_normalization": None}
     from cosmos_framework.data.vfm.action.bridge_orig_lerobot_dataset import _BRIDGE_TO_OPENCV
     from cosmos_framework.data.vfm.action.droid_lerobot_dataset import _DROID_TO_OPENCV
     from cosmos_framework.data.vfm.action.fractal import _GOOGLE_ROBOT_TO_OPENCV
@@ -147,7 +139,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
             max_finger_width=0.0,
             fps=10,
             action_format=ActionFormat.EGO_9D,
-            viewer_overrides=raw_action_override,
         ),
         "fractal": _lazycfg_to_entry(
             DATASET_FRACTAL_256,
@@ -158,7 +149,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
             camera_fov_deg=69.0,
             camera_aspect=320 / 256,
             to_opencv=_GOOGLE_ROBOT_TO_OPENCV,
-            viewer_overrides=raw_action_override,
         ),
         "bridge": _lazycfg_to_entry(
             DATASET_BRIDGE_480,
@@ -167,7 +157,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
             fps=5,
             action_format=ActionFormat.SINGLE_ARM_10D,
             to_opencv=_BRIDGE_TO_OPENCV,
-            viewer_overrides=raw_action_override,
         ),
         "droid": _lazycfg_to_entry(
             DATASET_DROID_480,
@@ -176,7 +165,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
             fps=15,
             action_format=ActionFormat.SINGLE_ARM_10D,
             to_opencv=_DROID_TO_OPENCV,
-            viewer_overrides=raw_action_override,
         ),
         "umi": _lazycfg_to_entry(
             DATASET_UMI_256,
@@ -184,7 +172,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
             max_finger_width=0.0,
             fps=20,
             action_format=ActionFormat.SINGLE_ARM_10D,
-            viewer_overrides=raw_action_override,
         ),
         "robomind_franka": _lazycfg_to_entry(
             DATASET_ROBOMIND_FRANKA_480,
@@ -193,7 +180,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
             fps=10,
             action_format=ActionFormat.SINGLE_ARM_10D,
             to_opencv=_FRANKA_TO_OPENCV,
-            viewer_overrides=raw_action_override,
         ),
         "robomind_franka_dual": _lazycfg_to_entry(
             DATASET_ROBOMIND_FRANKA_DUAL_480,
@@ -210,7 +196,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
                 dtype=np.float32,
             ),
             to_opencv=_FRANKA_TO_OPENCV,
-            viewer_overrides=raw_action_override,
         ),
     }
@@ -238,7 +223,7 @@ def _create_dataset(entry: DatasetEntry, chunk_length: int):
     # UMI: factory function
     if callable(cls) and not inspect.isclass(cls):
-        _OMEGACONF_BLOCKLIST = {"chunk_length", "split", "action_normalization", "enable_fast_init"}
         kwargs = {k: v for k, v in kwargs.items() if k not in _OMEGACONF_BLOCKLIST}
         kwargs["eager_load"] = True
         return cls(**kwargs)
@@ -312,17 +297,6 @@ def _build_viewer_idle_action_spec(action_format: ActionFormat) -> Any:
             Rot("rot6d", prefix="right"),
             Gripper(prefix="right"),
         )
-    if action_format is ActionFormat.UNIFIED_57D:
-        return build_action_spec(
-            Pos(prefix="ego"),
-            Rot("rot6d", prefix="ego"),
-            Pos(prefix="right_wrist"),
-            Rot("rot6d", prefix="right_wrist"),
-            Pos(dim=15, prefix="right_fingers"),
-            Pos(prefix="left_wrist"),
-            Rot("rot6d", prefix="left_wrist"),
-            Pos(dim=15, prefix="left_fingers"),
-        )
     raise ValueError(f"Unsupported action format for idle-frame detection: {action_format}")
@@ -622,7 +596,7 @@ def launch_viewer(
         cam_panel = client.gui.add_image(np.zeros((64, 64, 3), dtype=np.uint8))
         renderer.set_video_panel(cam_panel)
-        with client.gui.add_folder("Action (57D)"):
             action_text = client.gui.add_markdown("*No episode loaded*")
         show = {
@@ -645,7 +619,7 @@ def launch_viewer(
         )
         def _update_action_text(t: int) -> None:
-            """Update the 57D action display for one client."""
             txt = renderer.format_action_text(t)
             action_text.content = txt if txt else "*No data*"
@@ -780,14 +754,6 @@ def launch_viewer(
                 )
                 state.video = get_video_from_sample(sample)
-                # Inject FK joint configs when the dataset provides them (e.g. UR).
-                jc = sample.get("joint_configs")
-                if jc is not None:
-                    state.joint_configs = (
-                        jc.numpy().astype(np.float32)
-                        if isinstance(jc, torch.Tensor)
-                        else np.asarray(jc, dtype=np.float32)
-                    )
                 status_text.content = "⏳ Loading robot animation..."
                 renderer.load(state, entry, to_opencv=to_opencv)
                 _rebuild_robot_frame_toggles()
@@ -805,7 +771,7 @@ def launch_viewer(
                     + (f"Task: {ai_caption_text}\n\n" if ai_caption_text else "")
                     + (f"Debug: {debug_caption_text}\n\n" if debug_caption_text else "")
                     + (
-                        f"Steps: {T} | Raw: {raw_action_label} ({action_raw.shape[-1]}D) → 57D | "
                         f"Robot: {entry.robot_name or '—'} | FPS: {entry.fps}"
                     )
                 )
@@ -947,7 +913,7 @@ def launch_viewer(
 def main():
-    parser = argparse.ArgumentParser(description="Action dataset viewer (unified 57D)")
     parser.add_argument("--port", type=int, default=8013)
     parser.add_argument("--share", action="store_true")
     parser.add_argument("--chunk-length", type=int, default=16)

 """Interactive 3D viewer for robot action datasets.
+Uses raw release action layouts: 9D ego/camera, 10D single-arm, and 20D dual-arm.
+Each sample is converted to a small internal render state before drawing.
 Dependencies::
     # Use each dataset's declared raw action format:
     uv run python cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py --share
 """
 from __future__ import annotations
     dataset_class = target if isinstance(target, str) else f"{target.__module__}.{target.__qualname__}"
     ds_items = ds_cfg.items() if isinstance(ds_cfg, dict) else ds_cfg.items()
     dataset_kwargs = {key: value for key, value in ds_items if key != "_target_"}
     if viewer_overrides is not None:
         dataset_kwargs.update(viewer_overrides)
         DATASET_UMI_256,
     )
     from cosmos_framework.data.vfm.action.bridge_orig_lerobot_dataset import _BRIDGE_TO_OPENCV
     from cosmos_framework.data.vfm.action.droid_lerobot_dataset import _DROID_TO_OPENCV
     from cosmos_framework.data.vfm.action.fractal import _GOOGLE_ROBOT_TO_OPENCV
             max_finger_width=0.0,
             fps=10,
             action_format=ActionFormat.EGO_9D,
         ),
         "fractal": _lazycfg_to_entry(
             DATASET_FRACTAL_256,
             camera_fov_deg=69.0,
             camera_aspect=320 / 256,
             to_opencv=_GOOGLE_ROBOT_TO_OPENCV,
         ),
         "bridge": _lazycfg_to_entry(
             DATASET_BRIDGE_480,
             fps=5,
             action_format=ActionFormat.SINGLE_ARM_10D,
             to_opencv=_BRIDGE_TO_OPENCV,
         ),
         "droid": _lazycfg_to_entry(
             DATASET_DROID_480,
             fps=15,
             action_format=ActionFormat.SINGLE_ARM_10D,
             to_opencv=_DROID_TO_OPENCV,
         ),
         "umi": _lazycfg_to_entry(
             DATASET_UMI_256,
             max_finger_width=0.0,
             fps=20,
             action_format=ActionFormat.SINGLE_ARM_10D,
         ),
         "robomind_franka": _lazycfg_to_entry(
             DATASET_ROBOMIND_FRANKA_480,
             fps=10,
             action_format=ActionFormat.SINGLE_ARM_10D,
             to_opencv=_FRANKA_TO_OPENCV,
         ),
         "robomind_franka_dual": _lazycfg_to_entry(
             DATASET_ROBOMIND_FRANKA_DUAL_480,
                 dtype=np.float32,
             ),
             to_opencv=_FRANKA_TO_OPENCV,
         ),
     }
     # UMI: factory function
     if callable(cls) and not inspect.isclass(cls):
+        _OMEGACONF_BLOCKLIST = {"chunk_length", "split", "enable_fast_init"}
         kwargs = {k: v for k, v in kwargs.items() if k not in _OMEGACONF_BLOCKLIST}
         kwargs["eager_load"] = True
         return cls(**kwargs)
             Rot("rot6d", prefix="right"),
             Gripper(prefix="right"),
         )
     raise ValueError(f"Unsupported action format for idle-frame detection: {action_format}")
         cam_panel = client.gui.add_image(np.zeros((64, 64, 3), dtype=np.uint8))
         renderer.set_video_panel(cam_panel)
+        with client.gui.add_folder("Action"):
             action_text = client.gui.add_markdown("*No episode loaded*")
         show = {
         )
         def _update_action_text(t: int) -> None:
+            """Update the action display for one client."""
             txt = renderer.format_action_text(t)
             action_text.content = txt if txt else "*No data*"
                 )
                 state.video = get_video_from_sample(sample)
                 status_text.content = "⏳ Loading robot animation..."
                 renderer.load(state, entry, to_opencv=to_opencv)
                 _rebuild_robot_frame_toggles()
                     + (f"Task: {ai_caption_text}\n\n" if ai_caption_text else "")
                     + (f"Debug: {debug_caption_text}\n\n" if debug_caption_text else "")
                     + (
+                        f"Steps: {T} | Raw action: {raw_action_label} ({action_raw.shape[-1]}D) | "
                         f"Robot: {entry.robot_name or '—'} | FPS: {entry.fps}"
                     )
                 )
 def main():
+    parser = argparse.ArgumentParser(description="Action dataset viewer")
     parser.add_argument("--port", type=int, default=8013)
     parser.add_argument("--share", action="store_true")
     parser.add_argument("--chunk-length", type=int, default=16)