Spaces:

nvidia
/

Cosmos3-Action-Viewer

Running

App Files Community

XinKongCosmos committed about 5 hours ago

Commit

381b35a

verified ·

1 Parent(s): 3264a6d

Deep trim viewer-only release

Browse files

Files changed (13) hide show

cosmos-framework/cosmos_framework/data/imaginaire/__init__.py +0 -0
cosmos-framework/cosmos_framework/data/imaginaire/webdataset/__init__.py +0 -0
cosmos-framework/cosmos_framework/data/vfm/action/action_spec.py +0 -235
cosmos-framework/cosmos_framework/data/vfm/action/av_dataset.py +18 -23
cosmos-framework/cosmos_framework/data/vfm/action/bridge_orig_lerobot_dataset.py +0 -9
cosmos-framework/cosmos_framework/data/vfm/action/camera_dataset.py +0 -15
cosmos-framework/cosmos_framework/data/vfm/action/cosmos3_action_lerobot.py +2 -171
cosmos-framework/cosmos_framework/data/vfm/action/domain_utils.py +0 -29
cosmos-framework/cosmos_framework/data/vfm/action/droid_lerobot_dataset.py +0 -14
cosmos-framework/cosmos_framework/data/vfm/action/fractal.py +0 -9
cosmos-framework/cosmos_framework/data/vfm/action/pose_utils.py +0 -206
cosmos-framework/cosmos_framework/data/vfm/action/robomind_franka_dataset.py +1 -41
cosmos-framework/cosmos_framework/data/vfm/action/umi_lerobot_dataset.py +0 -4

cosmos-framework/cosmos_framework/data/imaginaire/__init__.py DELETED Viewed

File without changes

cosmos-framework/cosmos_framework/data/imaginaire/webdataset/__init__.py DELETED Viewed

File without changes

cosmos-framework/cosmos_framework/data/vfm/action/action_spec.py DELETED Viewed

@@ -1,235 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-"""Action-vector specification: per-dim type label + idle thresholds.
-Single concept: every column of an action vector has a :class:`DimType` label.
-Idle detection iterates by type and applies the matching algorithm:
-    POS      → ‖action[pos_idx]‖ per arm < eps_t
-    ROT      → distance(rot, identity) per group < eps_r
-    GRIPPER  → max |Δgripper| < eps_g (frame 0 idle by convention)
-    JOINT    → max |Δjoint|   < joint_threshold (frame 0 idle)
-    RESERVED → ignored
-An :class:`ActionSpec` is just ``names`` + ``types`` + ``rotation_format``.
-Build one declaratively via :func:`build_action_spec` from DSL components::
-    build_action_spec(Pos(), Rot("rot6d"), Gripper())             # 10D single arm
-    build_action_spec(Pos(), Rot("rot6d"))                        # 9D no gripper
-    build_action_spec(Joint(n=14, label="arm"),                   # 30D joint-space
-                      Joint(n=14, label="end"),
-                      Joint(n=2,  label="gripper"))
-    build_action_spec(Pos(prefix="left"),  Rot("rot6d", "left"),  Gripper(prefix="left"),
-                      Pos(prefix="right"), Rot("rot6d", "right"), Gripper(prefix="right"))
-Naming convention:
-    Default ``pos_x``, ``rot_0``, ``gripper``, ``arm_0`` ...
-    With ``prefix="left"`` (idempotent on trailing ``_``): ``left_pos_x`` ...
-"""
-from __future__ import annotations
-from dataclasses import dataclass
-from enum import Enum
-from typing import ClassVar
-from cosmos_framework.data.vfm.action.pose_utils import (
-    RotationConvention,
-    _identity_rotation_vector,
-)
-class DimType(str, Enum):
-    """Per-column action-dim category (drives idle detection)."""
-    POS = "pos"
-    ROT = "rot"
-    GRIPPER = "gripper"
-    JOINT = "joint"
-    RESERVED = "reserved"
-@dataclass(frozen=True, slots=True)
-class ActionSpec:
-    """Structural description of an action vector: names + per-dim types.
-    All ROT dims share a single ``rotation_format``; mixed formats in one spec
-    are not supported (raise at build time).
-    This struct contains no detection thresholds — those are passed at call
-    time to :func:`compute_idle_frames` so each dataset can tune them
-    independently of layout.
-    """
-    names: list[str]
-    types: list[DimType]
-    rotation_format: RotationConvention = "rot6d"
-    @property
-    def dim(self) -> int:
-        return len(self.names)
-# ---------------------------------------------------------------------------
-# DSL components
-# ---------------------------------------------------------------------------
-def _join_prefix(prefix: str, name: str) -> str:
-    """Join ``prefix`` and ``name`` with a single ``_``; idempotent on trailing ``_``."""
-    return name if not prefix else f"{prefix.rstrip('_')}_{name}"
-@dataclass(frozen=True)
-class Pos:
-    """Translation block.
-    Default 3D (``pos_x``, ``pos_y``, ``pos_z``). For planar tasks (e.g. PushT)
-    use ``Pos(dim=2)`` → ``pos_x``, ``pos_y``. ``dim >= 4`` falls back to
-    indexed names ``pos_0``, ``pos_1``, ...
-    """
-    dim: int = 3
-    prefix: str = ""
-    type: ClassVar[DimType] = DimType.POS
-    def names(self) -> list[str]:
-        if self.dim <= 3:
-            return [_join_prefix(self.prefix, f"pos_{c}") for c in "xyz"[: self.dim]]
-        return [_join_prefix(self.prefix, f"pos_{i}") for i in range(self.dim)]
-@dataclass(frozen=True)
-class Rot:
-    """Rotation block; ``format`` selects the encoding.
-    Supported formats and per-dim names:
-    - ``rot6d``      → 6 dims, ``rot_0`` ... ``rot_5``     (identity ``[1,0,0,0,1,0]``)
-    - ``rot9d``      → 9 dims, ``rot_0`` ... ``rot_8``     (identity ``[1,0,0,0,1,0,0,0,1]``)
-    - ``euler_xyz``  → 3 dims, ``roll``, ``pitch``, ``yaw`` (identity ``[0,0,0]``)
-    - ``axisangle``  → 3 dims, ``axang_x/y/z``              (identity ``[0,0,0]``)
-    - ``quat_xyzw`` / ``quat_wxyz`` → 4 dims, ``quat_x/y/z/w`` in declared order
-    """
-    format: RotationConvention = "rot6d"
-    prefix: str = ""
-    type: ClassVar[DimType] = DimType.ROT
-    @property
-    def rotation_format(self) -> RotationConvention:
-        return self.format
-    @property
-    def dim(self) -> int:
-        return _identity_rotation_vector(self.format).shape[0]
-    def names(self) -> list[str]:
-        if self.format == "euler_xyz":
-            return [_join_prefix(self.prefix, c) for c in ("roll", "pitch", "yaw")]
-        if self.format == "axisangle":
-            return [_join_prefix(self.prefix, f"axang_{c}") for c in "xyz"]
-        if self.format.startswith("quat_"):
-            order = self.format.split("_", 1)[1]  # "xyzw" or "wxyz"
-            return [_join_prefix(self.prefix, f"quat_{c}") for c in order]
-        return [_join_prefix(self.prefix, f"rot_{i}") for i in range(self.dim)]
-@dataclass(frozen=True)
-class Gripper:
-    """1D gripper command (binary 0/1 or continuous). Detected by frame-diff."""
-    prefix: str = ""
-    type: ClassVar[DimType] = DimType.GRIPPER
-    @property
-    def dim(self) -> int:
-        return 1
-    def names(self) -> list[str]:
-        return [_join_prefix(self.prefix, "gripper")]
-@dataclass(frozen=True)
-class Joint:
-    """``n`` joint commands. Detected by frame-diff against ``joint_threshold``."""
-    n: int = 0
-    label: str = "joint"
-    prefix: str = ""
-    type: ClassVar[DimType] = DimType.JOINT
-    @property
-    def dim(self) -> int:
-        return self.n
-    def names(self) -> list[str]:
-        return [_join_prefix(self.prefix, f"{self.label}_{i}") for i in range(self.n)]
-@dataclass(frozen=True)
-class Reserved:
-    """``n`` dims counted in ``action_dim`` but ignored by idle detection."""
-    n: int = 0
-    label: str = "reserved"
-    prefix: str = ""
-    type: ClassVar[DimType] = DimType.RESERVED
-    @property
-    def dim(self) -> int:
-        return self.n
-    def names(self) -> list[str]:
-        return [_join_prefix(self.prefix, f"{self.label}_{i}") for i in range(self.n)]
-# ---------------------------------------------------------------------------
-# Builder
-# ---------------------------------------------------------------------------
-# Type alias for any DSL component. Not a runtime check — only annotation hint.
-Component = Pos | Rot | Gripper | Joint | Reserved
-def build_action_spec(*components: Component) -> ActionSpec:
-    """Compose ``components`` into an :class:`ActionSpec`.
-    Each component contributes its ``names()`` and replicates its ``type`` for
-    every column it occupies. The first ROT component's ``rotation_format``
-    is captured for the whole spec; mixing formats raises ``ValueError``.
-    """
-    names: list[str] = []
-    types: list[DimType] = []
-    rotation_format: RotationConvention | None = None
-    for c in components:
-        names.extend(c.names())
-        types.extend([c.type] * c.dim)
-        if c.type == DimType.ROT:
-            fmt = c.rotation_format  # type: ignore[union-attr]
-            if rotation_format is None:
-                rotation_format = fmt
-            elif rotation_format != fmt:
-                raise ValueError(f"Mixed rotation_format in one ActionSpec: {rotation_format!r} vs {fmt!r}")
-    return ActionSpec(
-        names=names,
-        types=types,
-        rotation_format=rotation_format or "rot6d",
-    )
-__all__ = [
-    "ActionSpec",
-    "Component",
-    "DimType",
-    "Gripper",
-    "Joint",
-    "Pos",
-    "Reserved",
-    "Rot",
-    "build_action_spec",
-]

cosmos-framework/cosmos_framework/data/vfm/action/av_dataset.py CHANGED Viewed

@@ -37,8 +37,6 @@ from torch.utils.data import IterableDataset
 # torch.multiprocessing.set_sharing_strategy("file_system")
 from cosmos_framework.utils import log
 from cosmos_framework.utils.easy_io import easy_io
-from cosmos_framework.data.vfm.action.camera_dataset import get_target_size_and_crop
-from cosmos_framework.data.vfm.action.domain_utils import get_domain_id
 from cosmos_framework.data.vfm.action.pose_utils import (
     RotationConvention,
     build_abs_pose_from_components,
@@ -46,6 +44,24 @@ from cosmos_framework.data.vfm.action.pose_utils import (
 )
 def decode_video_bytes(
     video_bytes: bytes,
     resolution: str | None = None,
@@ -509,7 +525,6 @@ class AVDataset(IterableDataset):
         resolution: str | None = None,
         fps: int = 10,
         mode: str = "policy",
-        embodiment_type: str = "av",
         split: str = "train",
         seed: int = 0,
         shuffle: bool = True,
@@ -529,11 +544,6 @@ class AVDataset(IterableDataset):
         rotation_scale: float = 1.0,
         max_action_translation_norm: float | None = None,
         align_opencv_pose: bool = False,
-        # When True, use a separate domain ID for inverse dynamics / policy modes
-        # so that DomainAwareLinear learns different projections for anchored (conditioning)
-        # vs framewise (generation) action representations.
-        mode_aware_domain: bool = False,
-        inv_embodiment_type: str = "av_inv",
     ):
         """Initialize AVDataset.
@@ -543,7 +553,6 @@ class AVDataset(IterableDataset):
             resolution: Target resolution for video frames (e.g. "256", "480"). If None, keeps original resolution.
             fps: Target frames per second for video and actions.
             mode: Training mode ('policy', 'forward_dynamics', 'inverse_dynamics', 'image2video', 'joint').
-            embodiment_type: Embodiment type for domain ID.
             split: Dataset split ('train', 'val', or 'full').
             seed: Random seed for shuffling.
             shuffle: Whether to shuffle tar files during iteration (for training).
@@ -570,8 +579,6 @@ class AVDataset(IterableDataset):
             align_opencv_pose: If True, transform pose rotations from car body-frame
                 convention (x=forward, y=left, z=up) to OpenCV camera convention
                 (x=right, y=down, z=forward) before computing relative actions.
-            mode_aware_domain: When True, inverse_dynamics/policy modes use a separate domain ID.
-            inv_embodiment_type: Embodiment type string for the inverse domain ID.
         """
         super().__init__()
@@ -602,11 +609,6 @@ class AVDataset(IterableDataset):
         self.rotation_scale = rotation_scale
         self.max_action_translation_norm = max_action_translation_norm
         self.align_opencv_pose = align_opencv_pose
-        # Get domain ID for this embodiment
-        self.domain_id = get_domain_id(embodiment_type)
-        self.mode_aware_domain = mode_aware_domain
-        self.domain_id_inv = get_domain_id(inv_embodiment_type) if mode_aware_domain else self.domain_id
         # Validate mode
         valid_modes = ["joint", "forward_dynamics", "inverse_dynamics", "policy", "image2video"]
         if mode not in valid_modes:
@@ -864,11 +866,6 @@ class AVDataset(IterableDataset):
                 )
         # prompt += f"Predict the future {future_duration:.1f}s action trajectory at {self.fps}Hz."
-        # Select domain ID: use inverse domain for generation modes when mode_aware_domain is on
-        if self.mode_aware_domain and mode in ["inverse_dynamics", "policy"]:
-            domain_id = self.domain_id_inv
-        else:
-            domain_id = self.domain_id
         sample = {
             "video": video,
@@ -881,7 +878,6 @@ class AVDataset(IterableDataset):
             "ai_caption": prompt,
             "mode": mode,
             "__key__": key_tensor,
-            "domain_id": torch.tensor(domain_id, dtype=torch.long),
             "history_length": actual_history_length,
             "future_length": actual_future_length,
             "viewpoint": "ego_view",
@@ -1001,7 +997,6 @@ if __name__ == "__main__":
         print(f"{'future_length':<25}: {data['future_length']}")
         print(f"{'conditioning_fps':<25}: {data['conditioning_fps'].item()}")
         print(f"{'mode':<25}: {data['mode']}")
-        print(f"{'domain_id':<25}: {data['domain_id'].item()}")
         print(f"{'prompt':<25}: {data['prompt']}")
         # save video

 # torch.multiprocessing.set_sharing_strategy("file_system")
 from cosmos_framework.utils import log
 from cosmos_framework.utils.easy_io import easy_io
 from cosmos_framework.data.vfm.action.pose_utils import (
     RotationConvention,
     build_abs_pose_from_components,
 )
+VIDEO_RES_SIZE_INFO: dict[str, dict[str, tuple[int, int]]] = {
+    "256": {"1,1": (256, 256), "4,3": (320, 256), "3,4": (256, 320), "16,9": (320, 192), "9,16": (192, 320)},
+    "480": {"1,1": (640, 640), "4,3": (736, 544), "3,4": (544, 736), "16,9": (832, 480), "9,16": (480, 832)},
+}
+def get_target_size_and_crop(resolution: str, current_H: int, current_W: int) -> tuple[int, int, int, int]:
+    target_resolutions = VIDEO_RES_SIZE_INFO[resolution]
+    current_ar = current_W / current_H
+    best_key = min(
+        target_resolutions,
+        key=lambda key: abs((int(key.split(",")[0]) / int(key.split(",")[1])) - current_ar),
+    )
+    target_canvas_W, target_canvas_H = target_resolutions[best_key]
+    scaling_ratio = max(target_canvas_W / current_W, target_canvas_H / current_H)
+    return int(scaling_ratio * current_H + 0.5), int(scaling_ratio * current_W + 0.5), target_canvas_H, target_canvas_W
 def decode_video_bytes(
     video_bytes: bytes,
     resolution: str | None = None,
         resolution: str | None = None,
         fps: int = 10,
         mode: str = "policy",
         split: str = "train",
         seed: int = 0,
         shuffle: bool = True,
         rotation_scale: float = 1.0,
         max_action_translation_norm: float | None = None,
         align_opencv_pose: bool = False,
     ):
         """Initialize AVDataset.
             resolution: Target resolution for video frames (e.g. "256", "480"). If None, keeps original resolution.
             fps: Target frames per second for video and actions.
             mode: Training mode ('policy', 'forward_dynamics', 'inverse_dynamics', 'image2video', 'joint').
             split: Dataset split ('train', 'val', or 'full').
             seed: Random seed for shuffling.
             shuffle: Whether to shuffle tar files during iteration (for training).
             align_opencv_pose: If True, transform pose rotations from car body-frame
                 convention (x=forward, y=left, z=up) to OpenCV camera convention
                 (x=right, y=down, z=forward) before computing relative actions.
         """
         super().__init__()
         self.rotation_scale = rotation_scale
         self.max_action_translation_norm = max_action_translation_norm
         self.align_opencv_pose = align_opencv_pose
         # Validate mode
         valid_modes = ["joint", "forward_dynamics", "inverse_dynamics", "policy", "image2video"]
         if mode not in valid_modes:
                 )
         # prompt += f"Predict the future {future_duration:.1f}s action trajectory at {self.fps}Hz."
         sample = {
             "video": video,
             "ai_caption": prompt,
             "mode": mode,
             "__key__": key_tensor,
             "history_length": actual_history_length,
             "future_length": actual_future_length,
             "viewpoint": "ego_view",
         print(f"{'future_length':<25}: {data['future_length']}")
         print(f"{'conditioning_fps':<25}: {data['conditioning_fps'].item()}")
         print(f"{'mode':<25}: {data['mode']}")
         print(f"{'prompt':<25}: {data['prompt']}")
         # save video

cosmos-framework/cosmos_framework/data/vfm/action/bridge_orig_lerobot_dataset.py CHANGED Viewed

@@ -18,12 +18,7 @@ from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata
 from cosmos_framework.utils import log
 from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
-    ActionSpec,
     BaseActionLeRobotDataset,
-    Gripper,
-    Pos,
-    Rot,
-    build_action_spec,
 )
 from cosmos_framework.data.vfm.action.pose_utils import (
     PoseConvention,
@@ -111,7 +106,6 @@ class BridgeOrigLeRobotDataset(BaseActionLeRobotDataset):
             split_val_ratio=split_val_ratio,
             split=split,
             mode=mode,
-            embodiment_type="bridge_orig_lerobot",
             viewpoint=viewpoint,
             pose_convention=pose_convention,
             rotation_format="rot6d",
@@ -240,9 +234,6 @@ class BridgeOrigLeRobotDataset(BaseActionLeRobotDataset):
     # __getitem__
     # ------------------------------------------------------------------
-    def _build_action_spec(self) -> ActionSpec:
-        """Bridge: 10D = ``[Pos, Rot6d, Gripper]``."""
-        return build_action_spec(Pos(), Rot("rot6d"), Gripper())
     def __getitem__(self, idx: int) -> dict[str, Any]:
         """ """

 from cosmos_framework.utils import log
 from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
     BaseActionLeRobotDataset,
 )
 from cosmos_framework.data.vfm.action.pose_utils import (
     PoseConvention,
             split_val_ratio=split_val_ratio,
             split=split,
             mode=mode,
             viewpoint=viewpoint,
             pose_convention=pose_convention,
             rotation_format="rot6d",
     # __getitem__
     # ------------------------------------------------------------------
     def __getitem__(self, idx: int) -> dict[str, Any]:
         """ """

cosmos-framework/cosmos_framework/data/vfm/action/camera_dataset.py DELETED Viewed

@@ -1,15 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-VIDEO_RES_SIZE_INFO: dict[str, dict[str, tuple[int, int]]] = {
-    "256": {"1,1": (256, 256), "4,3": (320, 256), "3,4": (256, 320), "16,9": (320, 192), "9,16": (192, 320)},
-    "480": {"1,1": (640, 640), "4,3": (736, 544), "3,4": (544, 736), "16,9": (832, 480), "9,16": (480, 832)},
-}
-def get_target_size_and_crop(resolution: str, current_H: int, current_W: int) -> tuple[int, int, int, int]:
-    target_resolutions = VIDEO_RES_SIZE_INFO[resolution]
-    current_ar = current_W / current_H
-    best_key = min(target_resolutions, key=lambda key: abs((int(key.split(',')[0]) / int(key.split(',')[1])) - current_ar))
-    target_canvas_W, target_canvas_H = target_resolutions[best_key]
-    scaling_ratio = max(target_canvas_W / current_W, target_canvas_H / current_H)
-    return int(scaling_ratio * current_H + 0.5), int(scaling_ratio * current_W + 0.5), target_canvas_H, target_canvas_W

cosmos-framework/cosmos_framework/data/vfm/action/cosmos3_action_lerobot.py CHANGED Viewed

@@ -14,7 +14,6 @@ from __future__ import annotations
 import importlib
 import logging as _logging
-import math
 import os as _os
 import random
 from bisect import bisect_right
@@ -53,23 +52,7 @@ def _ensure_hf_hub_offline() -> None:
     _hf_offline_applied = True
-from functools import cached_property
 from cosmos_framework.utils import log
-# Re-export the action_spec DSL from this module so that subclass datasets
-# only need a single import block (alongside ``BaseActionLeRobotDataset``).
-from cosmos_framework.data.vfm.action.action_spec import (  # noqa: F401  (re-export)
-    ActionSpec,
-    DimType,
-    Gripper,
-    Joint,
-    Pos,
-    Reserved,
-    Rot,
-    build_action_spec,
-)
-from cosmos_framework.data.vfm.action.domain_utils import get_domain_id
-from cosmos_framework.data.vfm.action.pose_utils import compute_idle_frames
 from cosmos_framework.data.vfm.action.viewpoint_utils import Viewpoint
 # ---------------------------------------------------------------------------
@@ -278,7 +261,6 @@ class BaseActionLeRobotDataset(Dataset):
         split_val_ratio: float,
         split: str,
         mode: str,
-        embodiment_type: str,
         viewpoint: Viewpoint,
         pose_convention: str | None = None,
         rotation_format: str | None = None,
@@ -301,7 +283,6 @@ class BaseActionLeRobotDataset(Dataset):
         self._split_val_ratio = split_val_ratio
         self._split = _normalize_split(split)
         self._mode = mode
-        self._embodiment_type = embodiment_type
         self._viewpoint: Viewpoint = viewpoint
         self._pose_convention = pose_convention
         self._rotation_format = rotation_format
@@ -331,7 +312,6 @@ class BaseActionLeRobotDataset(Dataset):
         self._episode_records: list[tuple[int, int, int, int]] = []
         self._episode_cum_ends: list[int] = []
         self._num_valid_indices = 0
-        self._domain_id = get_domain_id(self._embodiment_type)
         self._all_shard_roots: list[str] = []
     # -- public properties ---------------------------------------------------
@@ -356,9 +336,6 @@ class BaseActionLeRobotDataset(Dataset):
     def mode(self, value: str) -> None:
         self._mode = value
-    @property
-    def domain_id(self) -> int:
-        return self._domain_id
     # -- source registration -------------------------------------------------
@@ -679,138 +656,6 @@ class BaseActionLeRobotDataset(Dataset):
     # -- result building -----------------------------------------------------
-    def _build_action_spec(self) -> ActionSpec | None:
-        """Subclass override: declare this dataset's action layout.
-        Called once per instance — the result is cached by ``self.action_spec``.
-        Return ``None`` to skip spec-driven idle detection; in that case
-        ``_compute_idle_frames`` will log a one-time warning and return
-        ``None`` for every sample.
-        """
-        return None
-    @cached_property
-    def action_spec(self) -> ActionSpec | None:
-        """Cached :class:`ActionSpec` from ``_build_action_spec``.
-        Returns ``None`` when the subclass did not declare one; idle detection
-        is then skipped (with a one-time warning) until the subclass overrides
-        ``_build_action_spec``.
-        """
-        return self._build_action_spec()
-    @cached_property
-    def action_names(self) -> list[str] | None:
-        spec = self.action_spec
-        return spec.names if spec is not None else None
-    # Idle-detection thresholds. Defined as **velocities** (per second) so the
-    # same numeric value means the same physical motion across datasets with
-    # different sampling rates; converted to per-frame at call time using
-    # ``self._fps`` via :meth:`_resolve_idle_thresholds`.
-    #
-    # Defaults:
-    #   - ``idle_eps_t_per_sec``           = 5 mm/s   (≈ 1 mm/frame at 5 Hz)
-    #   - ``idle_eps_r_per_sec``           = 1.5°/s   (geodesic, rotation-format aware)
-    #   - ``idle_eps_g``                   = 1e-2     unit gripper Δ (no fps)
-    #   - ``idle_joint_threshold_per_sec`` = 5e-3 rad/s
-    #   - ``idle_min_streak``              = 3        require ≥ 3 consecutive
-    #
-    # Subclasses can either override the ``*_per_sec`` attributes (preferred —
-    # keeps the velocity semantics) or set the corresponding ``idle_eps_*`` /
-    # ``idle_joint_threshold`` attribute to a non-``None`` value to bypass the
-    # per-fps conversion entirely (raw per-frame override).
-    idle_eps_t_per_sec: float = 5e-3
-    idle_eps_r_per_sec: float = math.radians(1.5)
-    idle_eps_g: float = 1e-2
-    idle_joint_threshold_per_sec: float = 5e-3
-    idle_min_streak: int = 3
-    # Optional per-frame overrides. ``None`` (default) → use the ``*_per_sec``
-    # attribute / fps conversion above.
-    idle_eps_t: float | None = None
-    idle_eps_r: float | None = None
-    idle_joint_threshold: float | None = None
-    def _resolve_idle_thresholds(self) -> tuple[float, float, float, float]:
-        """Resolve per-frame idle thresholds for this dataset instance.
-        Returns ``(eps_t, eps_r, eps_g, joint_threshold)`` in raw per-frame
-        units. Honours direct per-frame overrides if the subclass sets the
-        non-``_per_sec`` attribute; otherwise scales the ``_per_sec`` values
-        by ``self._fps``.
-        """
-        fps = float(self._fps) if self._fps else 1.0
-        eps_t = self.idle_eps_t if self.idle_eps_t is not None else self.idle_eps_t_per_sec / fps
-        eps_r = self.idle_eps_r if self.idle_eps_r is not None else self.idle_eps_r_per_sec / fps
-        joint_thr = (
-            self.idle_joint_threshold
-            if self.idle_joint_threshold is not None
-            else self.idle_joint_threshold_per_sec / fps
-        )
-        return float(eps_t), float(eps_r), float(self.idle_eps_g), float(joint_thr)
-    def _compute_idle_frames(self, raw_action: torch.Tensor) -> torch.Tensor | None:
-        """Count idle frames in the *raw* (un-normalized) action chunk.
-        Requires ``self.action_spec`` to be declared via ``_build_action_spec``.
-        Returns ``None`` when:
-        - ``pose_convention`` is not ``"backward_framewise"`` (TODO: extend),
-        - the subclass has not declared an ``ActionSpec`` (logs a one-time warning),
-        - the action layout does not match the declared spec.
-        Detection thresholds come from the ``idle_eps_*`` class attributes
-        (overridable per dataset). Subclasses can also override this method
-        outright, or pass an explicit ``idle_frames`` integer via
-        ``**extras`` to :meth:`_build_result`.
-        """
-        # conventions (anchored / absolute) need different idle semantics.
-        if self._pose_convention != "backward_framewise":
-            if not getattr(self, "_warned_pose_convention", False):
-                log.warning(
-                    f"Dataset {self.__class__.__name__}: pose_convention="
-                    f"{self._pose_convention!r} is not 'backward_framewise'; "
-                    "skipping idle-frames detection. Centralize the dataset "
-                    "to backward_framewise to enable IdleFrames captioning."
-                )
-                self._warned_pose_convention = True
-            return None
-        spec = self.action_spec
-        if spec is None:
-            if not getattr(self, "_warned_no_action_spec", False):
-                log.warning(
-                    f"Dataset {self.__class__.__name__} has no action spec defined; "
-                    "skipping idle-frames detection. Override _build_action_spec() to enable it."
-                )
-                self._warned_no_action_spec = True
-            return None
-        eps_t, eps_r, eps_g, joint_thr = self._resolve_idle_thresholds()
-        try:
-            n = compute_idle_frames(
-                raw_action,
-                spec,
-                eps_t=eps_t,
-                eps_r=eps_r,
-                eps_g=eps_g,
-                joint_threshold=joint_thr,
-                min_streak=self.idle_min_streak,
-            )
-        except (ValueError, TypeError) as e:
-            if not getattr(self, "_warned_action_layout", False):
-                log.warning(
-                    f"Dataset {self.__class__.__name__}: action layout does "
-                    f"not match the declared ActionSpec "
-                    f"(action_dim={int(raw_action.shape[-1])}, "
-                    f"spec.dim={spec.dim}); skipping idle-frames detection. "
-                    f"Underlying error: {e}"
-                )
-                self._warned_action_layout = True
-            return None
-        return torch.tensor(n, dtype=torch.long)
     def _build_result(
         self,
         *,
@@ -823,25 +668,12 @@ class BaseActionLeRobotDataset(Dataset):
         """Assemble the common return dict for ``__getitem__``.
         ``video`` is expected in raw LeRobot layout before final formatting.
-        Subclasses may pass extra keys (e.g. ``initial_pose``) via ``**extras``.
-        ``idle_frames`` is auto-computed from the raw (un-normalized) ``action``
-        whenever the dataset's pose/rotation conventions allow it; subclasses
-        can override by passing ``idle_frames`` (int or scalar tensor) via
         ``**extras``.
         """
-        # Compute idle_frames from the raw action before normalization, unless
-        # the subclass has provided one explicitly via ``**extras``.
-        if "idle_frames" not in extras:
-            idle_frames = self._compute_idle_frames(action)
-            if idle_frames is not None:
-                extras = {"idle_frames": idle_frames, **extras}
         raw_action = action  # [T,D]
         if self._skip_video_loading:
-            result: dict[str, Any] = {"action": raw_action}
-            if "idle_frames" in extras:
-                result["idle_frames"] = extras["idle_frames"]
-            return result
         formatted_video = self._convert_video(video)  # [C,T,H,W] | None
         return {
             "ai_caption": ai_caption,
@@ -849,7 +681,6 @@ class BaseActionLeRobotDataset(Dataset):
             "action": raw_action,
             "conditioning_fps": torch.tensor(self._fps, dtype=torch.long),
             "mode": mode,
-            "domain_id": torch.tensor(self._domain_id, dtype=torch.long),
             "viewpoint": self._viewpoint,
             **extras,
         }

 import importlib
 import logging as _logging
 import os as _os
 import random
 from bisect import bisect_right
     _hf_offline_applied = True
 from cosmos_framework.utils import log
 from cosmos_framework.data.vfm.action.viewpoint_utils import Viewpoint
 # ---------------------------------------------------------------------------
         split_val_ratio: float,
         split: str,
         mode: str,
         viewpoint: Viewpoint,
         pose_convention: str | None = None,
         rotation_format: str | None = None,
         self._split_val_ratio = split_val_ratio
         self._split = _normalize_split(split)
         self._mode = mode
         self._viewpoint: Viewpoint = viewpoint
         self._pose_convention = pose_convention
         self._rotation_format = rotation_format
         self._episode_records: list[tuple[int, int, int, int]] = []
         self._episode_cum_ends: list[int] = []
         self._num_valid_indices = 0
         self._all_shard_roots: list[str] = []
     # -- public properties ---------------------------------------------------
     def mode(self, value: str) -> None:
         self._mode = value
     # -- source registration -------------------------------------------------
     # -- result building -----------------------------------------------------
     def _build_result(
         self,
         *,
         """Assemble the common return dict for ``__getitem__``.
         ``video`` is expected in raw LeRobot layout before final formatting.
+        Subclasses may pass extra viewer metadata (e.g. ``initial_pose``) via
         ``**extras``.
         """
         raw_action = action  # [T,D]
         if self._skip_video_loading:
+            return {"action": raw_action}
         formatted_video = self._convert_video(video)  # [C,T,H,W] | None
         return {
             "ai_caption": ai_caption,
             "action": raw_action,
             "conditioning_fps": torch.tensor(self._fps, dtype=torch.long),
             "mode": mode,
             "viewpoint": self._viewpoint,
             **extras,
         }

cosmos-framework/cosmos_framework/data/vfm/action/domain_utils.py DELETED Viewed

@@ -1,29 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-"""Domain ID helpers for cross-embodiment action datasets."""
-# Registry mapping a lower-case embodiment name to its integer domain ID.
-# IDs are not contiguous, and distinct names may share an ID on purpose
-# ("droid_lerobot" and "robomind-franka" are both 8).
-EMBODIMENT_TO_DOMAIN_ID: dict[str, int] = {
-    "no_action": 0,
-    "av": 1,
-    "camera_pose": 2,
-    "pusht": 4,
-    "umi": 6,
-    "bridge_orig_lerobot": 7,
-    "droid_lerobot": 8,
-    "robomind-franka": 8,  # Same ID as droid_lerobot: both use Robotiq and Franka hardware
-    "embodiment_b": 9,
-    "robomind-franka-dual": 12,
-    "fractal": 20,
-}
-def get_domain_id(embodiment_type: str) -> int:
-    """Get the domain ID for a given embodiment type.
-    The lookup is case-insensitive and ignores surrounding whitespace.
-    Args:
-        embodiment_type: Embodiment name, e.g. ``"umi"`` or ``"fractal"``.
-    Returns:
-        The integer ID registered in ``EMBODIMENT_TO_DOMAIN_ID``.
-    Raises:
-        KeyError: If the normalized name is not registered; the message lists
-            the available embodiment names in sorted order.
-    """
-    # Normalize so that e.g. " UMI " and "umi" resolve to the same entry.
-    key = embodiment_type.lower().strip()
-    if key not in EMBODIMENT_TO_DOMAIN_ID:
-        # Report the caller's original spelling (not the normalized key).
-        raise KeyError(
-            f"Unknown embodiment type: {embodiment_type!r}. "
-            f"Available embodiments: {sorted(EMBODIMENT_TO_DOMAIN_ID.keys())}"
-        )
-    return EMBODIMENT_TO_DOMAIN_ID[key]

cosmos-framework/cosmos_framework/data/vfm/action/droid_lerobot_dataset.py CHANGED Viewed

@@ -11,13 +11,7 @@ from scipy.spatial.transform import Rotation as R
 from cosmos_framework.utils import log
 from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
-    ActionSpec,
     BaseActionLeRobotDataset,
-    Gripper,
-    Joint,
-    Pos,
-    Rot,
-    build_action_spec,
     build_episode_spans,
     split_episode_ids,
 )
@@ -87,7 +81,6 @@ class DROIDLeRobotDataset(BaseActionLeRobotDataset):
             split_val_ratio=split_val_ratio,
             split=split,
             mode=mode,
-            embodiment_type="droid_lerobot",
             viewpoint=viewpoint,
             pose_convention=pose_convention,
             rotation_format="rot6d",
@@ -307,13 +300,6 @@ class DROIDLeRobotDataset(BaseActionLeRobotDataset):
         composite = torch.cat([wrist, bottom], dim=-2)  # [T,C,3H/2,W]
         return composite  # [T,C,3H/2,W]
-    def _build_action_spec(self) -> ActionSpec:
-        """DROID: 10D ``[Pos, Rot6d, Gripper]`` for ``ee_pose``,
-        8D ``[Joint(7), Gripper]`` for ``joint_pos``.
-        """
-        # Joint-space actions: seven joint dims followed by the gripper.
-        if self._action_space == "joint_pos":
-            return build_action_spec(Joint(n=7, label="joint"), Gripper())
-        # Every other action space falls back to the end-effector pose layout.
-        return build_action_spec(Pos(), Rot("rot6d"), Gripper())
     def __getitem__(self, idx: int) -> dict[str, Any]:
         """ """

 from cosmos_framework.utils import log
 from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
     BaseActionLeRobotDataset,
     build_episode_spans,
     split_episode_ids,
 )
             split_val_ratio=split_val_ratio,
             split=split,
             mode=mode,
             viewpoint=viewpoint,
             pose_convention=pose_convention,
             rotation_format="rot6d",
         composite = torch.cat([wrist, bottom], dim=-2)  # [T,C,3H/2,W]
         return composite  # [T,C,3H/2,W]
     def __getitem__(self, idx: int) -> dict[str, Any]:
         """ """

cosmos-framework/cosmos_framework/data/vfm/action/fractal.py CHANGED Viewed

@@ -15,12 +15,7 @@ from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata
 from cosmos_framework.utils import log
 from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
-    ActionSpec,
     BaseActionLeRobotDataset,
-    Gripper,
-    Pos,
-    Rot,
-    build_action_spec,
 )
 from cosmos_framework.data.vfm.action.pose_utils import (
     PoseConvention,
@@ -112,7 +107,6 @@ class FractalLeRobotDataset(BaseActionLeRobotDataset):
             split_val_ratio=split_val_ratio,
             split=split,
             mode=mode,
-            embodiment_type="fractal",
             viewpoint=viewpoint,
             pose_convention=pose_convention,
             rotation_format="rot6d",
@@ -141,9 +135,6 @@ class FractalLeRobotDataset(BaseActionLeRobotDataset):
             )
         return kept
-    def _build_action_spec(self) -> ActionSpec:
-        """Fractal: 10D = ``[Pos(3), Rot6d(6), Gripper(1)]``."""
-        # Single fixed layout: translation, rot6d rotation, then gripper.
-        return build_action_spec(Pos(dim=3), Rot("rot6d"), Gripper())
     def __getitem__(self, idx: int) -> dict[str, Any]:
         """Return a single training sample."""

 from cosmos_framework.utils import log
 from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
     BaseActionLeRobotDataset,
 )
 from cosmos_framework.data.vfm.action.pose_utils import (
     PoseConvention,
             split_val_ratio=split_val_ratio,
             split=split,
             mode=mode,
             viewpoint=viewpoint,
             pose_convention=pose_convention,
             rotation_format="rot6d",
             )
         return kept
     def __getitem__(self, idx: int) -> dict[str, Any]:
         """Return a single training sample."""

cosmos-framework/cosmos_framework/data/vfm/action/pose_utils.py CHANGED Viewed

@@ -19,7 +19,6 @@ dataset stack:
     canonical public entrypoint for representation conversion.
 """
-import math
 from typing import Literal
 import numpy as np
@@ -540,208 +539,3 @@ def pose_rel_to_abs(
         current_pose = next_pose
     return np.stack(poses_abs)  # [T,4,4]
-# -----------------------------------------------------------------------------
-# Idle-frame detection
-# -----------------------------------------------------------------------------
-def _identity_rotation_vector(rotation_format: RotationConvention) -> np.ndarray:
-    """Return the identity-rotation vector for a given rotation convention.
-    Used by :func:`compute_idle_frames` to test whether a rotation block is
-    close to "no rotation" in its current encoding.
-    Args:
-        rotation_format: One of ``"matrix"``, ``"rot9d"``, ``"rot6d"``,
-            ``"quat_xyzw"``, ``"quat_wxyz"``, ``"euler_xyz"`` or ``"axisangle"``.
-    Returns:
-        A 1-D ``float32`` array whose length is the per-rotation dimension of
-        the encoding (9, 6, 4 or 3).
-    Raises:
-        ValueError: If ``rotation_format`` is not one of the values above.
-    """
-    # Flattened 3x3 identity (symmetric, so the flattening order is irrelevant).
-    if rotation_format in ("matrix", "rot9d"):
-        return np.array([1, 0, 0, 0, 1, 0, 0, 0, 1], dtype=np.float32)
-    # The two stored 3-vectors of the identity: (1, 0, 0) and (0, 1, 0).
-    if rotation_format == "rot6d":
-        return np.array([1, 0, 0, 0, 1, 0], dtype=np.float32)
-    # Identity quaternion has w = 1; only the position of w differs.
-    if rotation_format == "quat_xyzw":
-        return np.array([0, 0, 0, 1], dtype=np.float32)
-    if rotation_format == "quat_wxyz":
-        return np.array([1, 0, 0, 0], dtype=np.float32)
-    # Both 3-parameter encodings represent "no rotation" as the zero vector.
-    if rotation_format in ("euler_xyz", "axisangle"):
-        return np.array([0, 0, 0], dtype=np.float32)
-    raise ValueError(f"Unsupported rotation_format={rotation_format!r}")
-def _rotation_angle_per_arm(rotations: np.ndarray, rotation_format: str) -> np.ndarray:
-    """Geodesic angle (rad) from identity for each arm at each frame.
-    ``rotations`` has shape ``(T, n_arms, n_per_arm)``; the returned array has
-    shape ``(T, n_arms)``. The angle is rotation-format aware so a fixed
-    ``eps_r`` threshold has consistent geometric meaning across formats:
-    - ``rot6d``  → reconstruct ``trace(R)`` in closed form from the two stored
-      columns ``a, b`` (already unit-orthogonal as they came from a valid
-      rotation matrix). The third column is ``a × b``, so
-      ``trace(R) = a[0] + b[1] + a[0]·b[1] - a[1]·b[0]``.
-      ``angle = arccos(clip((trace - 1) / 2, -1, 1))``.
-    - ``rot9d``  → reshape to ``(..., 3, 3)`` and use
-      ``trace(R) = R[0,0] + R[1,1] + R[2,2]``.
-    - ``quat_xyzw`` / ``quat_wxyz`` → ``angle = 2 · arccos(|q_w|)``; the
-      absolute value handles the double cover (``q`` and ``-q`` represent the
-      same rotation).
-    - ``axisangle`` → the magnitude of the axis-angle vector *is* the angle.
-    - ``euler_xyz`` → no closed-form angle; use ``‖euler‖`` as a conservative
-      upper bound (exact for single-axis rotations, an overestimate for
-      composed ones — fine for idle detection where small angles are the
-      regime of interest).
-    Args:
-        rotations: Rotation blocks with the encoding on the last axis.
-        rotation_format: Name of the encoding used on the last axis.
-    Returns:
-        Array of angles in radians with the last axis of ``rotations`` reduced.
-    Raises:
-        ValueError: If ``rotation_format`` has no branch below.
-    """
-    if rotation_format == "rot6d":
-        a = rotations[..., :3]
-        b = rotations[..., 3:6]
-        # a[0]*b[1] - a[1]*b[0] is the z component of a × b, i.e. R[2,2].
-        trace = a[..., 0] + b[..., 1] + a[..., 0] * b[..., 1] - a[..., 1] * b[..., 0]
-        # The clip keeps the arccos argument inside [-1, 1].
-        return np.arccos(np.clip((trace - 1.0) / 2.0, -1.0, 1.0))
-    if rotation_format == "rot9d":
-        mat = rotations.reshape(*rotations.shape[:-1], 3, 3)
-        trace = mat[..., 0, 0] + mat[..., 1, 1] + mat[..., 2, 2]
-        return np.arccos(np.clip((trace - 1.0) / 2.0, -1.0, 1.0))
-    if rotation_format in ("quat_xyzw", "quat_wxyz"):
-        # Pick the scalar component according to the component ordering.
-        qw = rotations[..., 3] if rotation_format == "quat_xyzw" else rotations[..., 0]
-        return 2.0 * np.arccos(np.clip(np.abs(qw), 0.0, 1.0))
-    if rotation_format == "axisangle":
-        return np.linalg.norm(rotations, axis=-1)
-    if rotation_format == "euler_xyz":
-        # Exact for single-axis rotations, overestimate for composed ones —
-        # safe for idle thresholds since overestimation can only mark a frame
-        # as non-idle, never spuriously idle.
-        return np.linalg.norm(rotations, axis=-1)
-    # NOTE(review): "matrix" is accepted by _identity_rotation_vector but has no
-    # branch here, so it ends in the ValueError below — confirm this is intended.
-    raise ValueError(f"Unsupported rotation_format={rotation_format!r}")
-def _consecutive_streaks(idle: np.ndarray, min_streak: int) -> np.ndarray:
-    """Zero out idle bits not belonging to a run of ``>= min_streak`` Trues.
-    Pure-numpy two-pointer scan. ``min_streak <= 1`` is a no-op (returns the
-    input mask unchanged).
-    Args:
-        idle: Per-frame mask, truthy where the frame is idle (a 1-D mask when
-            called from :func:`compute_idle_frames`).
-        min_streak: Minimum run length that is kept.
-    Returns:
-        A mask shaped like ``idle``; for ``min_streak <= 1`` this is the very
-        same array object, otherwise a newly allocated array.
-    """
-    if min_streak <= 1:
-        return idle
-    out = np.zeros_like(idle)
-    n = len(idle)
-    i = 0
-    while i < n:
-        # Skip non-idle frames until the start of the next run.
-        if not idle[i]:
-            i += 1
-            continue
-        # i marks the run start; advance j to one past the run end.
-        j = i
-        while j < n and idle[j]:
-            j += 1
-        # Keep the run only if it is long enough.
-        if j - i >= min_streak:
-            out[i:j] = True
-        # Resume scanning right after the run just examined.
-        i = j
-    return out
-def compute_idle_frames(
-    action_raw: torch.Tensor | np.ndarray,
-    spec: "ActionSpec",  # noqa: F821 — forward ref, real import is in action_spec.py
-    *,
-    eps_t: float = 1e-3,
-    eps_r: float = math.radians(5.0),
-    eps_g: float = 1e-2,
-    joint_threshold: float = 5e-4,
-    min_streak: int = 3,
-) -> int:
-    """Count idle frames in a raw (un-normalized) action chunk.
-    Idle detection runs per-DimType (driven by ``spec.types``); a frame is
-    *raw-idle* iff every relevant type group is idle on that frame, and
-    counts toward the final tally only if it belongs to a run of at least
-    ``min_streak`` consecutive raw-idle frames. The streak filter rejects
-    isolated low-motion frames (instantaneous slowdowns) which carry weak
-    physical meaning and add noise to the IdleFrames training signal.
-    DimType branches:
-    - ``POS``      → combined ``‖action[pos_idx]‖`` (L2 across all POS dims)
-      < ``eps_t``. For single-arm specs (3 dims) this is the standard ``‖t‖``
-      check; for multi-arm specs the combined norm is slightly stricter than
-      a per-arm check.
-    - ``ROT``      → per-arm geodesic rotation angle (rad) from identity
-      < ``eps_r``. The angle is computed in a rotation-format aware way (see
-      :func:`_rotation_angle_per_arm`) so the threshold has consistent
-      geometric meaning regardless of the encoding.
-    - ``GRIPPER``  → ``max |action[t] - action[t-1]| < eps_g``. ``np.diff``
-      with ``prepend=action[0]`` makes step 0 ``|0|`` (treated as "no change");
-      with the streak filter this can no longer create a spurious single-frame
-      idle event.
-    - ``JOINT``    → same frame-diff scheme as gripper with
-      ``joint_threshold`` (rad / step).
-    - ``RESERVED`` → ignored.
-    Defaults (in the units of the un-normalized action):
-    - ``eps_t = 1e-3``     → 1 mm per-frame translation
-    - ``eps_r = 5°``       → 5° per-frame rotation (geodesic angle)
-    - ``eps_g = 1e-2``     → 1 % gripper command change
-    - ``joint_threshold = 5e-4`` → ~0.03° / step joint angle change
-    - ``min_streak = 3``   → require a run of >= 3 consecutive idle frames
-    The input must be **un-normalized** so the identity transform sits at
-    known coordinates (translation ≈ 0, rotation ≈ identity). The action
-    vector is also assumed to be encoded in a per-step / framewise convention
-    (e.g. ``backward_framewise``); anchored conventions (``backward_anchored``)
-    accumulate over the chunk and would silently break the POS/ROT idle
-    checks. Callers (e.g. the LeRobot base class) gate on pose convention
-    before calling this function.
-    Args:
-        action_raw: Raw action chunk of shape ``(T, D)``; a tensor is detached,
-            moved to CPU and converted to a ``float32`` numpy array.
-        spec: Action spec; ``spec.types`` must hold one ``DimType`` per action
-            column and ``spec.rotation_format`` names the ROT encoding.
-        eps_t: POS threshold on the combined L2 norm.
-        eps_r: ROT threshold on the per-arm angle, in radians.
-        eps_g: GRIPPER threshold on the frame-to-frame absolute change.
-        joint_threshold: JOINT threshold on the frame-to-frame absolute change.
-        min_streak: Minimum run length of raw-idle frames that is counted.
-    Returns:
-        Number of idle frames that survive the streak filter.
-    Raises:
-        ValueError: If ``action_raw`` is not 2-D, if ``D`` differs from
-            ``len(spec.types)``, if the ROT column count is not a multiple of
-            the per-rotation dimension, or if ``spec.rotation_format`` is not
-            supported by the rotation helpers.
-    """
-    if isinstance(action_raw, torch.Tensor):
-        action = action_raw.detach().cpu().numpy().astype(np.float32, copy=False)
-    else:
-        action = np.asarray(action_raw, dtype=np.float32)
-    if action.ndim != 2:
-        raise ValueError(f"action_raw must be 2-D (T, D); got shape {action.shape}")
-    num_frames, action_dim = action.shape
-    # An empty chunk returns early, before the spec-dimension check below.
-    if num_frames == 0:
-        return 0
-    if action_dim != len(spec.types):
-        raise ValueError(f"action_dim={action_dim} does not match spec.dim={len(spec.types)}")
-    # Import locally to avoid a circular import at module load time
-    # (action_spec.py imports RotationConvention from this file).
-    from cosmos_framework.data.vfm.action.action_spec import DimType
-    # Column indices per DimType; columns of any other type are never inspected.
-    pos_idx = [i for i, t in enumerate(spec.types) if t == DimType.POS]
-    rot_idx = [i for i, t in enumerate(spec.types) if t == DimType.ROT]
-    grip_idx = [i for i, t in enumerate(spec.types) if t == DimType.GRIPPER]
-    joint_idx = [i for i, t in enumerate(spec.types) if t == DimType.JOINT]
-    # Start with every frame idle; each group below can only clear bits.
-    idle = np.ones(num_frames, dtype=bool)
-    # POS: combined L2 norm across all translation dims.
-    if pos_idx:
-        idle &= np.linalg.norm(action[:, pos_idx], axis=1) < eps_t
-    # ROT: per-arm geodesic angle (rad).
-    if rot_idx:
-        # Only the length of the identity vector is used here: it gives the
-        # number of columns one rotation occupies in this encoding.
-        rot_id = _identity_rotation_vector(spec.rotation_format)
-        n_per_arm = rot_id.shape[0]
-        if len(rot_idx) % n_per_arm != 0:
-            raise ValueError(
-                f"ROT dims ({len(rot_idx)}) not a multiple of "
-                f"rotation_format={spec.rotation_format!r} dim ({n_per_arm})"
-            )
-        rotations = action[:, rot_idx].reshape(num_frames, -1, n_per_arm)
-        angles = _rotation_angle_per_arm(rotations, spec.rotation_format)  # (T, n_arms)
-        # A frame is rotation-idle only if its largest per-arm angle is small.
-        idle &= angles.max(axis=1) < eps_r
-    # GRIPPER: max |Δgripper| across all gripper dims; step 0's diff is 0.
-    if grip_idx:
-        gripper = action[:, grip_idx]
-        # prepend=gripper[:1] repeats the first row (kept 2-D), so row 0 diffs to 0.
-        diff = np.abs(np.diff(gripper, axis=0, prepend=gripper[:1]))
-        idle &= diff.max(axis=1) < eps_g
-    # JOINT: same frame-diff scheme with joint_threshold.
-    if joint_idx:
-        joints = action[:, joint_idx]
-        diff = np.abs(np.diff(joints, axis=0, prepend=joints[:1]))
-        idle &= diff.max(axis=1) < joint_threshold
-    # Drop raw-idle frames that are not part of a long-enough run.
-    if min_streak > 1:
-        idle = _consecutive_streaks(idle, min_streak)
-    return int(idle.sum())

     canonical public entrypoint for representation conversion.
 """
 from typing import Literal
 import numpy as np
         current_pose = next_pose
     return np.stack(poses_abs)  # [T,4,4]

cosmos-framework/cosmos_framework/data/vfm/action/robomind_franka_dataset.py CHANGED Viewed

@@ -16,7 +16,6 @@
 from __future__ import annotations
-import math
 import os
 from typing import Any, cast
@@ -25,12 +24,7 @@ import torch
 import torch.nn.functional as F
 from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
-    ActionSpec,
     BaseActionLeRobotDataset,
-    Gripper,
-    Pos,
-    Rot,
-    build_action_spec,
 )
 from cosmos_framework.data.vfm.action.pose_utils import (
     PoseConvention,
@@ -77,14 +71,6 @@ class RoboMINDFrankaDataset(BaseActionLeRobotDataset):
     # 1.5°/s) so a single arm doing a slow approach (~1mm/f at 10 Hz) is no
     # longer classified as idle.
     #
-    # Class defaults below match single-arm. Dual-arm overrides at instance
-    # construction (see ``__init__``).
-    _IDLE_EPS_T_SINGLE: float = 22e-3
-    _IDLE_EPS_R_SINGLE: float = math.radians(3.0)
-    _IDLE_EPS_T_DUAL: float = 5e-3  # = base default; tight enough
-    _IDLE_EPS_R_DUAL: float = math.radians(1.5)  # for "single-arm-slow" cases
-    idle_eps_t_per_sec: float = _IDLE_EPS_T_SINGLE
-    idle_eps_r_per_sec: float = _IDLE_EPS_R_SINGLE
     def __init__(
         self,
@@ -113,7 +99,6 @@ class RoboMINDFrankaDataset(BaseActionLeRobotDataset):
             split_val_ratio=split_val_ratio,
             split=split,
             mode=mode,
-            embodiment_type=embodiment_type,
             viewpoint=viewpoint,
             pose_convention=pose_convention,
             rotation_format="rot6d",
@@ -121,15 +106,10 @@ class RoboMINDFrankaDataset(BaseActionLeRobotDataset):
             enable_fast_init=enable_fast_init,
         )
         self._to_opencv: np.ndarray = _ROBOMIND_FRANKA_TO_OPENCV[:3, :3]
         self._is_concat_view: bool = viewpoint == "concat_view"
-        # Per-embodiment idle thresholds (instance-level override of the
-        # class default which matches single-arm). Dual-arm tightens both
-        # eps_t and eps_r to reflect its smaller per-frame motion tail.
-        if embodiment_type == "robomind-franka-dual":
-            self.idle_eps_t_per_sec = self._IDLE_EPS_T_DUAL
-            self.idle_eps_r_per_sec = self._IDLE_EPS_R_DUAL
         embodiment_key = embodiment_type.removeprefix("robomind-")
         lerobot_roots = LEROBOT_ROOTS[embodiment_key]
@@ -220,26 +200,6 @@ class RoboMINDFrankaDataset(BaseActionLeRobotDataset):
         composite = torch.cat([top_or_front, bottom], dim=-2)  # [T,C,3H/2,W]
         return composite  # [T,C,3H/2,W]
-    def _build_action_spec(self) -> ActionSpec:
-        """RoboMIND Franka: 10D single-arm or 20D dual-arm.
-        Single (``robomind-franka``):
-            ``[Pos, Rot6d, Gripper]``  (10D)
-        Dual (``robomind-franka-dual``):
-            ``[L_Pos, L_Rot6d, L_Gripper, R_Pos, R_Rot6d, R_Gripper]``  (20D)
-        """
-        # Only the exact single-arm name selects the single-arm layout.
-        if self._embodiment_type == "robomind-franka":
-            return build_action_spec(Pos(), Rot("rot6d"), Gripper())
-        # dual arm
-        # Any other embodiment type lands here: left-arm block, then right-arm block.
-        return build_action_spec(
-            Pos(prefix="left"),
-            Rot("rot6d", prefix="left"),
-            Gripper(prefix="left"),
-            Pos(prefix="right"),
-            Rot("rot6d", prefix="right"),
-            Gripper(prefix="right"),
-        )
     def __getitem__(self, idx: int) -> dict[str, Any]:
         mode, _, _, sample = self._fetch_sample(idx)

 from __future__ import annotations
 import os
 from typing import Any, cast
 import torch.nn.functional as F
 from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
     BaseActionLeRobotDataset,
 )
 from cosmos_framework.data.vfm.action.pose_utils import (
     PoseConvention,
     # 1.5°/s) so a single arm doing a slow approach (~1mm/f at 10 Hz) is no
     # longer classified as idle.
     #
     def __init__(
         self,
             split_val_ratio=split_val_ratio,
             split=split,
             mode=mode,
             viewpoint=viewpoint,
             pose_convention=pose_convention,
             rotation_format="rot6d",
             enable_fast_init=enable_fast_init,
         )
+        self._embodiment_type = embodiment_type
         self._to_opencv: np.ndarray = _ROBOMIND_FRANKA_TO_OPENCV[:3, :3]
         self._is_concat_view: bool = viewpoint == "concat_view"
         embodiment_key = embodiment_type.removeprefix("robomind-")
         lerobot_roots = LEROBOT_ROOTS[embodiment_key]
         composite = torch.cat([top_or_front, bottom], dim=-2)  # [T,C,3H/2,W]
         return composite  # [T,C,3H/2,W]
     def __getitem__(self, idx: int) -> dict[str, Any]:
         mode, _, _, sample = self._fetch_sample(idx)

cosmos-framework/cosmos_framework/data/vfm/action/umi_lerobot_dataset.py CHANGED Viewed

@@ -11,7 +11,6 @@ import numpy as np
 import torch
 from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata
-from cosmos_framework.data.vfm.action.action_spec import ActionSpec, Gripper, Pos, Rot, build_action_spec
 from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import BaseActionLeRobotDataset
 from cosmos_framework.data.vfm.action.pose_utils import PoseConvention, build_abs_pose_from_components, pose_abs_to_rel
 from cosmos_framework.data.vfm.action.viewpoint_utils import Viewpoint
@@ -45,7 +44,6 @@ class UMIFastLeRobotDataset(BaseActionLeRobotDataset):
             split_val_ratio=split_val_ratio,
             split=split,
             mode=mode,
-            embodiment_type="umi",
             viewpoint=viewpoint,
             pose_convention=pose_convention,
             rotation_format="rot6d",
@@ -62,8 +60,6 @@ class UMIFastLeRobotDataset(BaseActionLeRobotDataset):
             _GRIPPER_FEATURE: observation_ts,
         }
-    def _build_action_spec(self) -> ActionSpec:
-        """UMI: ``[Pos, Rot6d, Gripper]`` action layout."""
-        return build_action_spec(Pos(), Rot("rot6d"), Gripper())
     def _register_sources(self, shard_indices: list[int] | None = None) -> None:
         roots = self._all_shard_roots if shard_indices is None else [self._all_shard_roots[i] for i in shard_indices]

 import torch
 from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata
 from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import BaseActionLeRobotDataset
 from cosmos_framework.data.vfm.action.pose_utils import PoseConvention, build_abs_pose_from_components, pose_abs_to_rel
 from cosmos_framework.data.vfm.action.viewpoint_utils import Viewpoint
             split_val_ratio=split_val_ratio,
             split=split,
             mode=mode,
             viewpoint=viewpoint,
             pose_convention=pose_convention,
             rotation_format="rot6d",
             _GRIPPER_FEATURE: observation_ts,
         }
     def _register_sources(self, shard_indices: list[int] | None = None) -> None:
         roots = self._all_shard_roots if shard_indices is None else [self._all_shard_roots[i] for i in shard_indices]