XinKongCosmos commited on
Commit
80ca707
·
verified ·
1 Parent(s): e861d93

Clean raw-action viewer release

Browse files
Files changed (24) hide show
  1. cosmos-framework/cosmos_framework/data/vfm/action/action_normalization.py +0 -49
  2. cosmos-framework/cosmos_framework/data/vfm/action/bridge_orig_lerobot_dataset.py +0 -4
  3. cosmos-framework/cosmos_framework/data/vfm/action/cosmos3_action_lerobot.py +4 -97
  4. cosmos-framework/cosmos_framework/data/vfm/action/domain_utils.py +0 -3
  5. cosmos-framework/cosmos_framework/data/vfm/action/droid_lerobot_dataset.py +2 -5
  6. cosmos-framework/cosmos_framework/data/vfm/action/fractal.py +0 -4
  7. cosmos-framework/cosmos_framework/data/vfm/action/libero_dataset.py +0 -611
  8. cosmos-framework/cosmos_framework/data/vfm/action/libero_pose_utils.py +0 -69
  9. cosmos-framework/cosmos_framework/data/vfm/action/normalizers/bridge_orig_lerobot_backward_framewise_rot6d.json +0 -33
  10. cosmos-framework/cosmos_framework/data/vfm/action/normalizers/droid_lerobot_backward_framewise_rot6d.json +0 -33
  11. cosmos-framework/cosmos_framework/data/vfm/action/normalizers/fractal_backward_framewise_rot6d.json +0 -33
  12. cosmos-framework/cosmos_framework/data/vfm/action/normalizers/libero_native_frame_wise_relative_rot6d.json +0 -37
  13. cosmos-framework/cosmos_framework/data/vfm/action/normalizers/robomind-franka-dual_backward_framewise_rot6d.json +0 -33
  14. cosmos-framework/cosmos_framework/data/vfm/action/normalizers/robomind-franka_backward_framewise_rot6d.json +0 -33
  15. cosmos-framework/cosmos_framework/data/vfm/action/robomind_franka_dataset.py +0 -3
  16. cosmos-framework/cosmos_framework/data/vfm/action/umi_lerobot_dataset.py +1 -3
  17. cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/README.md +12 -28
  18. cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/action_datasets.py +0 -6
  19. cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/ik_solver.py +2 -2
  20. cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/unified_action.py +68 -326
  21. cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/unified_renderer.py +6 -96
  22. cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/ur5e_robotiq_2f85.xml +0 -326
  23. cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/urdf_loader.py +0 -139
  24. cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py +7 -41
cosmos-framework/cosmos_framework/data/vfm/action/action_normalization.py DELETED
@@ -1,49 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: OpenMDW-1.1
3
-
4
- """Action normalization helpers."""
5
-
6
- import json
7
- from pathlib import Path
8
-
9
- import numpy as np
10
- import torch
11
-
12
- from cosmos_framework.utils import log
13
-
14
-
15
- def load_action_stats(stats_path: str, stats_key: str = "global") -> dict[str, np.ndarray]:
16
- """Load pre-computed action normalization stats from a JSON file."""
17
- path = Path(stats_path)
18
- if not path.exists():
19
- raise FileNotFoundError(f"Action normalization stats not found at {stats_path}.")
20
- log.info(f"Loading action normalization stats from {stats_path}")
21
- with path.open("r") as f:
22
- raw = json.load(f)
23
- if stats_key in raw:
24
- raw = raw[stats_key]
25
- if not isinstance(raw, dict):
26
- raise TypeError(f"Action normalization stats block {stats_key!r} in {stats_path} must be a dict.")
27
- elif stats_key != "global":
28
- raise KeyError(f"Action normalization stats block {stats_key!r} not found in {stats_path}.")
29
- stat_keys = {"mean", "std", "min", "max", "q01", "q99"}
30
- return {k: np.array(v, dtype=np.float32) for k, v in raw.items() if k in stat_keys}
31
-
32
-
33
- def normalize_action(
34
- action: torch.Tensor,
35
- method: str,
36
- stats: dict[str, torch.Tensor],
37
- ) -> torch.Tensor:
38
- """Normalize action tensor (all dimensions including gripper)."""
39
- if method == "quantile":
40
- q01, q99 = stats["q01"], stats["q99"]
41
- denom = (q99 - q01).clamp(min=1e-8)
42
- return (2.0 * (action - q01) / denom - 1.0).clamp(-1.0, 1.0)
43
- if method == "meanstd":
44
- return (action - stats["mean"]) / stats["std"].clamp(min=1e-8)
45
- if method == "minmax":
46
- lo, hi = stats["min"], stats["max"]
47
- denom = (hi - lo).clamp(min=1e-8)
48
- return (2.0 * (action - lo) / denom - 1.0).clamp(-1.0, 1.0)
49
- raise ValueError(f"Unknown normalization method: {method!r}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cosmos-framework/cosmos_framework/data/vfm/action/bridge_orig_lerobot_dataset.py CHANGED
@@ -18,7 +18,6 @@ from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata
18
 
19
  from cosmos_framework.utils import log
20
  from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
21
- ActionNormalization,
22
  ActionSpec,
23
  BaseActionLeRobotDataset,
24
  Gripper,
@@ -101,7 +100,6 @@ class BridgeOrigLeRobotDataset(BaseActionLeRobotDataset):
101
  split: str = "train",
102
  mode: str = "policy",
103
  pose_convention: PoseConvention = "backward_framewise",
104
- action_normalization: ActionNormalization | None = None,
105
  viewpoint: Viewpoint = "ego_view",
106
  enable_fast_init: bool = False,
107
  ) -> None:
@@ -117,7 +115,6 @@ class BridgeOrigLeRobotDataset(BaseActionLeRobotDataset):
117
  viewpoint=viewpoint,
118
  pose_convention=pose_convention,
119
  rotation_format="rot6d",
120
- action_normalization=action_normalization,
121
  tolerance_s=1e-4,
122
  enable_fast_init=enable_fast_init,
123
  )
@@ -208,7 +205,6 @@ class BridgeOrigLeRobotDataset(BaseActionLeRobotDataset):
208
  # ------------------------------------------------------------------
209
  # Normalization is handled by BaseActionLeRobotDataset.
210
  # Stats are loaded from:
211
- # cosmos_framework/data/vfm/action/normalizers/
212
  # bridge_orig_lerobot_<pose_convention>_<rotation_format>.json
213
  # Regenerate via ``compute_action_stats.py`` + ``debug/stats_all.sh``.
214
  # ------------------------------------------------------------------
 
18
 
19
  from cosmos_framework.utils import log
20
  from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
 
21
  ActionSpec,
22
  BaseActionLeRobotDataset,
23
  Gripper,
 
100
  split: str = "train",
101
  mode: str = "policy",
102
  pose_convention: PoseConvention = "backward_framewise",
 
103
  viewpoint: Viewpoint = "ego_view",
104
  enable_fast_init: bool = False,
105
  ) -> None:
 
115
  viewpoint=viewpoint,
116
  pose_convention=pose_convention,
117
  rotation_format="rot6d",
 
118
  tolerance_s=1e-4,
119
  enable_fast_init=enable_fast_init,
120
  )
 
205
  # ------------------------------------------------------------------
206
  # Normalization is handled by BaseActionLeRobotDataset.
207
  # Stats are loaded from:
 
208
  # bridge_orig_lerobot_<pose_convention>_<rotation_format>.json
209
  # Regenerate via ``compute_action_stats.py`` + ``debug/stats_all.sh``.
210
  # ------------------------------------------------------------------
cosmos-framework/cosmos_framework/data/vfm/action/cosmos3_action_lerobot.py CHANGED
@@ -21,9 +21,8 @@ from bisect import bisect_right
21
  from collections import OrderedDict, defaultdict
22
  from collections.abc import Callable, Sequence
23
  from concurrent.futures import ThreadPoolExecutor
24
- from pathlib import Path
25
  from threading import Lock
26
- from typing import Any, ClassVar, Literal
27
 
28
  import huggingface_hub.constants as _hf_const
29
  import numpy as np
@@ -57,11 +56,6 @@ def _ensure_hf_hub_offline() -> None:
57
  from functools import cached_property
58
 
59
  from cosmos_framework.utils import log
60
- from cosmos_framework.data.vfm.action.action_normalization import (
61
- load_action_stats,
62
- normalize_action,
63
- )
64
-
65
  # Re-export the action_spec DSL from this module so that subclass datasets
66
  # only need a single import block (alongside ``BaseActionLeRobotDataset``).
67
  from cosmos_framework.data.vfm.action.action_spec import ( # noqa: F401 (re-export)
@@ -96,8 +90,6 @@ from cosmos_framework.data.vfm.action_scripts.memprofile import (
96
  # ---------------------------------------------------------------------------
97
  _LRU_VIDEO_CACHE_MAX_SIZE: int = 64
98
  _LRU_DATASET_MAX_LOADED: int = 32
99
- ActionNormalization = Literal["quantile", "quantile_rot", "meanstd", "minmax"]
100
- _ACTION_NORMALIZATION_CHOICES: tuple[str, ...] = ("quantile", "quantile_rot", "meanstd", "minmax")
101
 
102
  _decoder_cache_patched = False
103
 
@@ -290,14 +282,6 @@ class BaseActionLeRobotDataset(Dataset):
290
  # Applied as: R_opencv = R_native @ _to_opencv
291
  # Subclasses override in __init__; default is identity (no correction).
292
 
293
- # Bundled normalization stats directory. Stats are committed at
294
- # ``<_NORMALIZERS_DIR>/<embodiment>_<pose>_<rotation_format>.json`` (flat
295
- # layout matching the existing UMI files) and produced by
296
- # ``projects/cosmos3/vfm/datasets/action/compute_action_stats.py``.
297
- # Subclasses that need a different filename scheme can override
298
- # :meth:`_normalizer_filename`.
299
- _NORMALIZERS_DIR: ClassVar[Path] = Path(__file__).parent / "normalizers"
300
-
301
  def __init__(
302
  self,
303
  *,
@@ -311,7 +295,6 @@ class BaseActionLeRobotDataset(Dataset):
311
  viewpoint: Viewpoint,
312
  pose_convention: str | None = None,
313
  rotation_format: str | None = None,
314
- action_normalization: ActionNormalization | None = None,
315
  tolerance_s: float = 1e-4,
316
  max_loaded_datasets: int = _LRU_DATASET_MAX_LOADED,
317
  skip_video_loading: bool = False,
@@ -326,10 +309,6 @@ class BaseActionLeRobotDataset(Dataset):
326
 
327
  assert sample_stride >= 1, f"sample_stride must be >= 1, got {sample_stride}"
328
  assert fast_init_max_workers >= 1, f"fast_init_max_workers must be >= 1, got {fast_init_max_workers}"
329
- assert action_normalization is None or action_normalization in _ACTION_NORMALIZATION_CHOICES, (
330
- f"action_normalization must be None or one of {_ACTION_NORMALIZATION_CHOICES}, got {action_normalization!r}"
331
- )
332
-
333
  with rss_tracker(f"{self.__class__.__name__}.__init__", enabled=self._memprofile):
334
  self._fps = fps
335
  self._dt = 1.0 / fps
@@ -342,10 +321,6 @@ class BaseActionLeRobotDataset(Dataset):
342
  self._viewpoint: Viewpoint = viewpoint
343
  self._pose_convention = pose_convention
344
  self._rotation_format = rotation_format
345
- self._action_normalization = action_normalization
346
- # Lazy-loaded stats cache, populated on first call to
347
- # :meth:`_normalize_action`. Per-process (workers get their own).
348
- self._norm_stats: dict[str, torch.Tensor] | None = None
349
  self._tolerance_s = tolerance_s
350
  self._max_loaded_datasets = max_loaded_datasets
351
  self._skip_video_loading = skip_video_loading
@@ -728,74 +703,6 @@ class BaseActionLeRobotDataset(Dataset):
728
 
729
  return mode, dataset_idx, row_idx, sample
730
 
731
- # -- action normalization ------------------------------------------------
732
-
733
- def _normalizer_filename(self) -> str:
734
- """Bundled stats filename for this dataset instance.
735
-
736
- Default convention (matches ``compute_action_stats.py`` output):
737
- ``<embodiment_type>[_<pose_convention>][_<rotation_format>].json``.
738
-
739
- Pose/rotation suffixes are appended only when the instance actually
740
- has them (SE(3) pose datasets like Bridge / DROID). Joint-space
741
- datasets — where both are ``None`` — resolve to just
742
- ``<embodiment_type>.json``.
743
-
744
- Subclasses may override when the bundled filename uses a different
745
- scheme (e.g. UMI's ``uva_umi_single_task_normalizer.json``).
746
- """
747
- if not self._embodiment_type:
748
- raise RuntimeError(
749
- f"{self.__class__.__name__}: embodiment_type is not set; cannot resolve normalizer filename."
750
- )
751
- parts = [self._embodiment_type]
752
- if self._pose_convention:
753
- parts.append(self._pose_convention)
754
- if self._rotation_format:
755
- parts.append(self._rotation_format)
756
- return "_".join(parts) + ".json"
757
-
758
- def _normalizer_path(self) -> Path:
759
- """Full path to the bundled stats JSON for this dataset."""
760
- return self._NORMALIZERS_DIR / self._normalizer_filename()
761
-
762
- def _load_norm_stats(self) -> dict[str, torch.Tensor]:
763
- """Lazy-load action normalization stats (once per worker process).
764
-
765
- Raises :class:`FileNotFoundError` if the stats file is missing. This
766
- is intentional — silently falling back to identity normalization when
767
- the user asked for ``quantile`` / ``quantile_rot`` / ``meanstd`` /
768
- ``minmax`` would be a training bug.
769
- """
770
- if self._norm_stats is not None:
771
- return self._norm_stats
772
- stats_key = "global_raw" if self._action_normalization == "quantile_rot" else "global"
773
- raw = load_action_stats(str(self._normalizer_path()), stats_key=stats_key)
774
- self._norm_stats = {}
775
- for key, value in raw.items():
776
- self._norm_stats[key] = torch.from_numpy(value).float() # [D]
777
- return self._norm_stats
778
-
779
- def _normalize_action(self, action: torch.Tensor) -> torch.Tensor:
780
- """Apply the configured normalization, or return the raw action.
781
-
782
- - ``action_normalization=None`` → pass-through (used by viewer / debug)
783
- - ``"quantile"`` → ``2·(x − q01) / (q99 − q01) − 1`` clamped to [-1, 1]
784
- - ``"quantile_rot"`` → same as ``"quantile"``, but using ``global_raw``
785
- stats so rotation dimensions are normalized too.
786
- - ``"meanstd"`` → ``(x − mean) / std``
787
- - ``"minmax"`` → ``2·(x − min) / (max − min) − 1`` clamped to [-1, 1]
788
- """
789
- if self._action_normalization is None:
790
- return action
791
- method = "quantile" if self._action_normalization == "quantile_rot" else self._action_normalization
792
- normalized_action = normalize_action(
793
- action,
794
- method,
795
- self._load_norm_stats(),
796
- ) # [T,D]
797
- return normalized_action
798
-
799
  # -- video formatting ----------------------------------------------------
800
 
801
  def _convert_video(self, video_tchw: torch.Tensor | None) -> torch.Tensor | None:
@@ -989,9 +896,9 @@ class BaseActionLeRobotDataset(Dataset):
989
  if idle_frames is not None:
990
  extras = {"idle_frames": idle_frames, **extras}
991
 
992
- normalized_action = self._normalize_action(action) # [T,D]
993
  if self._skip_video_loading:
994
- result: dict[str, Any] = {"action": normalized_action}
995
  if "idle_frames" in extras:
996
  result["idle_frames"] = extras["idle_frames"]
997
  return result
@@ -999,7 +906,7 @@ class BaseActionLeRobotDataset(Dataset):
999
  return {
1000
  "ai_caption": ai_caption,
1001
  "video": formatted_video,
1002
- "action": normalized_action,
1003
  "conditioning_fps": torch.tensor(self._fps, dtype=torch.long),
1004
  "mode": mode,
1005
  "domain_id": torch.tensor(self._domain_id, dtype=torch.long),
 
21
  from collections import OrderedDict, defaultdict
22
  from collections.abc import Callable, Sequence
23
  from concurrent.futures import ThreadPoolExecutor
 
24
  from threading import Lock
25
+ from typing import Any, Literal
26
 
27
  import huggingface_hub.constants as _hf_const
28
  import numpy as np
 
56
  from functools import cached_property
57
 
58
  from cosmos_framework.utils import log
 
 
 
 
 
59
  # Re-export the action_spec DSL from this module so that subclass datasets
60
  # only need a single import block (alongside ``BaseActionLeRobotDataset``).
61
  from cosmos_framework.data.vfm.action.action_spec import ( # noqa: F401 (re-export)
 
90
  # ---------------------------------------------------------------------------
91
  _LRU_VIDEO_CACHE_MAX_SIZE: int = 64
92
  _LRU_DATASET_MAX_LOADED: int = 32
 
 
93
 
94
  _decoder_cache_patched = False
95
 
 
282
  # Applied as: R_opencv = R_native @ _to_opencv
283
  # Subclasses override in __init__; default is identity (no correction).
284
 
 
 
 
 
 
 
 
 
285
  def __init__(
286
  self,
287
  *,
 
295
  viewpoint: Viewpoint,
296
  pose_convention: str | None = None,
297
  rotation_format: str | None = None,
 
298
  tolerance_s: float = 1e-4,
299
  max_loaded_datasets: int = _LRU_DATASET_MAX_LOADED,
300
  skip_video_loading: bool = False,
 
309
 
310
  assert sample_stride >= 1, f"sample_stride must be >= 1, got {sample_stride}"
311
  assert fast_init_max_workers >= 1, f"fast_init_max_workers must be >= 1, got {fast_init_max_workers}"
 
 
 
 
312
  with rss_tracker(f"{self.__class__.__name__}.__init__", enabled=self._memprofile):
313
  self._fps = fps
314
  self._dt = 1.0 / fps
 
321
  self._viewpoint: Viewpoint = viewpoint
322
  self._pose_convention = pose_convention
323
  self._rotation_format = rotation_format
 
 
 
 
324
  self._tolerance_s = tolerance_s
325
  self._max_loaded_datasets = max_loaded_datasets
326
  self._skip_video_loading = skip_video_loading
 
703
 
704
  return mode, dataset_idx, row_idx, sample
705
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
706
  # -- video formatting ----------------------------------------------------
707
 
708
  def _convert_video(self, video_tchw: torch.Tensor | None) -> torch.Tensor | None:
 
896
  if idle_frames is not None:
897
  extras = {"idle_frames": idle_frames, **extras}
898
 
899
+ raw_action = action # [T,D]
900
  if self._skip_video_loading:
901
+ result: dict[str, Any] = {"action": raw_action}
902
  if "idle_frames" in extras:
903
  result["idle_frames"] = extras["idle_frames"]
904
  return result
 
906
  return {
907
  "ai_caption": ai_caption,
908
  "video": formatted_video,
909
+ "action": raw_action,
910
  "conditioning_fps": torch.tensor(self._fps, dtype=torch.long),
911
  "mode": mode,
912
  "domain_id": torch.tensor(self._domain_id, dtype=torch.long),
cosmos-framework/cosmos_framework/data/vfm/action/domain_utils.py CHANGED
@@ -7,16 +7,13 @@ EMBODIMENT_TO_DOMAIN_ID: dict[str, int] = {
7
  "no_action": 0,
8
  "av": 1,
9
  "camera_pose": 2,
10
- "hand_pose": 3,
11
  "pusht": 4,
12
- "libero": 5,
13
  "umi": 6,
14
  "bridge_orig_lerobot": 7,
15
  "droid_lerobot": 8,
16
  "robomind-franka": 8, # Both Droid and RoboMIND-Franka are using robotiq and franka
17
  "embodiment_b": 9,
18
  "robomind-franka-dual": 12,
19
- "robomind-ur": 13,
20
  "fractal": 20,
21
  }
22
 
 
7
  "no_action": 0,
8
  "av": 1,
9
  "camera_pose": 2,
 
10
  "pusht": 4,
 
11
  "umi": 6,
12
  "bridge_orig_lerobot": 7,
13
  "droid_lerobot": 8,
14
  "robomind-franka": 8, # Both Droid and RoboMIND-Franka are using robotiq and franka
15
  "embodiment_b": 9,
16
  "robomind-franka-dual": 12,
 
17
  "fractal": 20,
18
  }
19
 
cosmos-framework/cosmos_framework/data/vfm/action/droid_lerobot_dataset.py CHANGED
@@ -11,7 +11,6 @@ from scipy.spatial.transform import Rotation as R
11
 
12
  from cosmos_framework.utils import log
13
  from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
14
- ActionNormalization,
15
  ActionSpec,
16
  BaseActionLeRobotDataset,
17
  Gripper,
@@ -69,7 +68,6 @@ class DROIDLeRobotDataset(BaseActionLeRobotDataset):
69
  split: str = "train",
70
  mode: str = "policy",
71
  pose_convention: PoseConvention = "backward_framewise",
72
- action_normalization: ActionNormalization | None = None,
73
  tolerance_s=2e-4,
74
  viewpoint: Viewpoint = "concat_view",
75
  use_success_only: bool = False,
@@ -93,7 +91,6 @@ class DROIDLeRobotDataset(BaseActionLeRobotDataset):
93
  viewpoint=viewpoint,
94
  pose_convention=pose_convention,
95
  rotation_format="rot6d",
96
- action_normalization=action_normalization,
97
  tolerance_s=tolerance_s,
98
  enable_fast_init=enable_fast_init,
99
  )
@@ -398,7 +395,7 @@ class DROIDLeRobotDataset(BaseActionLeRobotDataset):
398
  axis=-1,
399
  )
400
  ).float()
401
- extras["history_action"] = self._normalize_action(hist_action_raw)
402
  if self._use_state:
403
  initial_gripper = sample[_GRIPPER_STATE_FEATURE][0].unsqueeze(-1)
404
  if self._is_gripper_action_flipped:
@@ -449,7 +446,7 @@ class DROIDLeRobotDataset(BaseActionLeRobotDataset):
449
  if self._is_gripper_action_flipped:
450
  hist_gripper = 1.0 - hist_gripper
451
  hist_action_raw = torch.cat((hist_joint, hist_gripper), dim=-1).float()
452
- extras["history_action"] = self._normalize_action(hist_action_raw)
453
  if self._use_state:
454
  initial_gripper = sample[_GRIPPER_STATE_FEATURE][-self._chunk_length - 1].unsqueeze(-1)
455
  if self._is_gripper_action_flipped:
 
11
 
12
  from cosmos_framework.utils import log
13
  from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
 
14
  ActionSpec,
15
  BaseActionLeRobotDataset,
16
  Gripper,
 
68
  split: str = "train",
69
  mode: str = "policy",
70
  pose_convention: PoseConvention = "backward_framewise",
 
71
  tolerance_s=2e-4,
72
  viewpoint: Viewpoint = "concat_view",
73
  use_success_only: bool = False,
 
91
  viewpoint=viewpoint,
92
  pose_convention=pose_convention,
93
  rotation_format="rot6d",
 
94
  tolerance_s=tolerance_s,
95
  enable_fast_init=enable_fast_init,
96
  )
 
395
  axis=-1,
396
  )
397
  ).float()
398
+ extras["history_action"] = hist_action_raw
399
  if self._use_state:
400
  initial_gripper = sample[_GRIPPER_STATE_FEATURE][0].unsqueeze(-1)
401
  if self._is_gripper_action_flipped:
 
446
  if self._is_gripper_action_flipped:
447
  hist_gripper = 1.0 - hist_gripper
448
  hist_action_raw = torch.cat((hist_joint, hist_gripper), dim=-1).float()
449
+ extras["history_action"] = hist_action_raw
450
  if self._use_state:
451
  initial_gripper = sample[_GRIPPER_STATE_FEATURE][-self._chunk_length - 1].unsqueeze(-1)
452
  if self._is_gripper_action_flipped:
cosmos-framework/cosmos_framework/data/vfm/action/fractal.py CHANGED
@@ -15,7 +15,6 @@ from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata
15
 
16
  from cosmos_framework.utils import log
17
  from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
18
- ActionNormalization,
19
  ActionSpec,
20
  BaseActionLeRobotDataset,
21
  Gripper,
@@ -84,7 +83,6 @@ class FractalLeRobotDataset(BaseActionLeRobotDataset):
84
  split: str = "train",
85
  mode: str = "policy",
86
  pose_convention: PoseConvention = "backward_framewise",
87
- action_normalization: ActionNormalization | None = None,
88
  viewpoint: Viewpoint = "ego_view",
89
  enable_fast_init: bool = False,
90
  ) -> None:
@@ -103,7 +101,6 @@ class FractalLeRobotDataset(BaseActionLeRobotDataset):
103
  actions. Supports ``"backward_framewise"`` and
104
  ``"backward_anchored"``. Set to ``None`` to disable action
105
  construction outside image-to-video mode.
106
- action_normalization: Optional bundled-stats normalization
107
  (``"quantile"`` / ``"quantile_rot"`` / ``"meanstd"`` / ``"minmax"``);
108
  ``None`` returns raw actions.
109
  viewpoint: Camera viewpoint type for this dataset.
@@ -119,7 +116,6 @@ class FractalLeRobotDataset(BaseActionLeRobotDataset):
119
  viewpoint=viewpoint,
120
  pose_convention=pose_convention,
121
  rotation_format="rot6d",
122
- action_normalization=action_normalization,
123
  tolerance_s=1e-4,
124
  enable_fast_init=enable_fast_init,
125
  )
 
15
 
16
  from cosmos_framework.utils import log
17
  from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
 
18
  ActionSpec,
19
  BaseActionLeRobotDataset,
20
  Gripper,
 
83
  split: str = "train",
84
  mode: str = "policy",
85
  pose_convention: PoseConvention = "backward_framewise",
 
86
  viewpoint: Viewpoint = "ego_view",
87
  enable_fast_init: bool = False,
88
  ) -> None:
 
101
  actions. Supports ``"backward_framewise"`` and
102
  ``"backward_anchored"``. Set to ``None`` to disable action
103
  construction outside image-to-video mode.
 
104
  (``"quantile"`` / ``"quantile_rot"`` / ``"meanstd"`` / ``"minmax"``);
105
  ``None`` returns raw actions.
106
  viewpoint: Camera viewpoint type for this dataset.
 
116
  viewpoint=viewpoint,
117
  pose_convention=pose_convention,
118
  rotation_format="rot6d",
 
119
  tolerance_s=1e-4,
120
  enable_fast_init=enable_fast_init,
121
  )
cosmos-framework/cosmos_framework/data/vfm/action/libero_dataset.py DELETED
@@ -1,611 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: OpenMDW-1.1
3
-
4
- """LIBERO dataset for training from local storage, supporting multiple dataset roots."""
5
-
6
- import random
7
- from pathlib import Path
8
- from typing import Literal
9
-
10
- import torch
11
- import torchvision.transforms.functional as F
12
- from lerobot.datasets.lerobot_dataset import LeRobotDataset
13
- from torch.utils.data import Dataset
14
-
15
- from cosmos_framework.utils import log
16
- from cosmos_framework.data.vfm.action.action_normalization import (
17
- load_action_stats,
18
- normalize_action,
19
- )
20
- from cosmos_framework.data.vfm.action.action_spec import (
21
- Gripper,
22
- Pos,
23
- Rot,
24
- build_action_spec,
25
- )
26
- from cosmos_framework.data.vfm.action.domain_utils import get_domain_id
27
- from cosmos_framework.data.vfm.action.libero_pose_utils import (
28
- libero_action_dim,
29
- libero_rotation_format,
30
- )
31
- from cosmos_framework.data.vfm.action.pose_utils import (
32
- compute_idle_frames,
33
- convert_rotation,
34
- )
35
-
36
- LIBERO_ROOTS: list[str] = [
37
- "<PATH_TO_LIBERO_10>",
38
- "<PATH_TO_LIBERO_90>",
39
- "<PATH_TO_LIBERO_OBJECT>",
40
- "<PATH_TO_LIBERO_SPATIAL>",
41
- "<PATH_TO_LIBERO_GOAL>",
42
- ]
43
-
44
-
45
- class LIBERODataset(Dataset):
46
- """
47
- A Dataset wrapper for LeRobot LIBERO dataset(s) designed for training from local storage.
48
-
49
- This dataset:
50
- - Loads data from local storage using LeRobotDataset
51
- - Supports multiple dataset roots that are concatenated into one dataset
52
- - Supports configurable camera modes (image, wrist_image, or concat_view)
53
- - Filters episodes for train/val split
54
- - Filters frames at episode boundaries (to avoid padding issues with delta timestamps)
55
- - Uses task descriptions from meta/tasks.parquet for ai_caption
56
- """
57
-
58
- _NORMALIZERS_DIR = Path(__file__).parent / "normalizers"
59
-
60
- def __init__(
61
- self,
62
- repo_id: str | list[str] = "lerobot/libero_90",
63
- root: str | list[str] | None = LIBERO_ROOTS,
64
- image_size: int = 256,
65
- chunk_length: int = 16, # must be divisible by 4
66
- fps: int = 10, # IMPORTANT! LIBERO is at 20fps. If using frame_wise_relative in policy mode, we have to match the fps.
67
- mode: str = "policy",
68
- video_backend: str | None = "torchcodec",
69
- download_videos: bool = False,
70
- force_cache_sync: bool = False,
71
- tolerance_s: float = 1e-4,
72
- split: str = "train",
73
- val_ratio: float = 0.01,
74
- seed: int = 0,
75
- # Camera configuration
76
- camera_mode: str = "image", # 'image', 'wrist_image', or 'concat_view'
77
- # Action configuration
78
- action_space: str = "frame_wise_relative", # "absolute" or "relative" or "frame_wise_relative"
79
- # rotation_space
80
- rotation_space: Literal["9d", "6d", "3d"] = "3d",
81
- # Native simulator frame or shared OpenCV-style EE frame used by midtraining.
82
- pose_coordinate_frame: Literal["native", "opencv"] = "native",
83
- # domain-aware configuration
84
- embodiment_type: str = "libero",
85
- action_normalization: Literal["quantile", "quantile_rot", "meanstd", "minmax"] | None = None,
86
- action_stats_path: str | None = None,
87
- skip_video_loading: bool = False,
88
- ):
89
- super().__init__()
90
- self._embodiment_type = embodiment_type
91
- self.domain_id = get_domain_id(embodiment_type)
92
- self.image_size = image_size
93
- self.chunk_length = chunk_length
94
- assert self.chunk_length % 4 == 0, "chunk_length must be divisible by 4"
95
- self.fps = fps
96
- self.mode = mode
97
- self.split = split.lower().strip()
98
- self.val_ratio = val_ratio
99
- self.seed = seed
100
- self.camera_mode = camera_mode.lower().strip()
101
- self.action_space = action_space
102
- self.action_normalization = action_normalization
103
- self.rotation_space = rotation_space.lower().strip()
104
- self.pose_coordinate_frame = pose_coordinate_frame
105
- self._pose_convention = self.action_space
106
- self._rotation_format = libero_rotation_format(self.rotation_space)
107
- # When True, skip video decoding entirely: drop image keys from
108
- # delta_timestamps so LeRobot never touches the mp4, and return
109
- # ``video=None`` in __getitem__. Must be set at construction time
110
- # because LeRobotDataset is eagerly built in __init__.
111
- self._skip_video_loading = bool(skip_video_loading)
112
-
113
- # Load action normalization stats. ``action_min`` / ``action_range`` are
114
- # retained for older LIBERO eval code that knows how to invert a
115
- # range-style [-1, 1] normalization.
116
- self._norm_stats: dict[str, torch.Tensor] | None = None
117
- self.action_min: torch.Tensor | None = None
118
- self.action_max: torch.Tensor | None = None
119
- self.action_range: torch.Tensor | None = None
120
- if self.action_normalization is not None:
121
- stats_path = self._resolve_action_stats_path(action_stats_path)
122
- stats_key = "global_raw" if self.action_normalization == "quantile_rot" else "global"
123
- raw_stats = load_action_stats(str(stats_path), stats_key=stats_key)
124
- self._norm_stats = {}
125
- for key, value in raw_stats.items():
126
- self._norm_stats[key] = torch.from_numpy(value).float() # [D]
127
- self._set_range_denormalization_stats()
128
- log.info(
129
- f"Loaded LIBERO action stats from {stats_path} with action_normalization={self.action_normalization}"
130
- )
131
-
132
- # Validate camera mode
133
- if self.camera_mode not in {"image", "wrist_image", "concat_view"}:
134
- raise ValueError(f"Unsupported camera_mode={camera_mode!r}. Use 'image', 'wrist_image', or 'concat_view'.")
135
-
136
- # Validate split
137
- if self.split not in {"train", "val", "valid", "validation", "eval", "test", "full"}:
138
- raise ValueError(f"Unsupported {split=}. Use train/val/full.")
139
-
140
- # Build delta timestamps based on camera mode
141
- dt = 1.0 / self.fps
142
-
143
- if self.fps != 20:
144
- log.warning(
145
- f"LIBERO is at 20fps. If using frame_wise_relative for policy mode training, we have to match the fps. fps={self.fps}"
146
- )
147
-
148
- # Determine which image keys to use
149
- if self.camera_mode == "image":
150
- self.image_keys = ["observation.images.image"]
151
- elif self.camera_mode == "wrist_image":
152
- self.image_keys = ["observation.images.wrist_image"]
153
- else: # concat_view
154
- self.image_keys = ["observation.images.image", "observation.images.wrist_image"]
155
-
156
- # Build delta_timestamps for all keys (same convention as PushT: 0 to chunk_length)
157
- self.delta_timestamps: dict[str, list[float]] = {}
158
- if not self._skip_video_loading:
159
- for key in self.image_keys:
160
- self.delta_timestamps[key] = [i * dt for i in range(0, chunk_length + 1)]
161
- self.delta_timestamps["observation.state"] = [i * dt for i in range(0, chunk_length + 1)]
162
- self.delta_timestamps["action"] = [i * dt for i in range(0, chunk_length + 1)]
163
-
164
- # Normalize repo_id and root to lists
165
- repo_id_list: list[str] = [repo_id] if isinstance(repo_id, str) else list(repo_id)
166
- root_list: list[str | None]
167
- if root is None:
168
- root_list = [None for _ in repo_id_list]
169
- elif isinstance(root, str):
170
- root_list = [root]
171
- else:
172
- root_list = [r for r in root]
173
-
174
- if len(repo_id_list) != len(root_list):
175
- raise ValueError(
176
- f"Length mismatch: repo_id has {len(repo_id_list)} items, root has {len(root_list)} items."
177
- )
178
-
179
- # Load all datasets
180
- self.datasets: list[LeRobotDataset] = []
181
- self.tasks_dfs: list = [] # Store tasks DataFrames for each dataset
182
- for rid, r in zip(repo_id_list, root_list):
183
- dataset = LeRobotDataset(
184
- repo_id=rid,
185
- root=r,
186
- delta_timestamps=self.delta_timestamps, # type: ignore
187
- tolerance_s=tolerance_s,
188
- force_cache_sync=force_cache_sync,
189
- download_videos=download_videos,
190
- video_backend=video_backend,
191
- episodes=None, # Load full dataset, filter later
192
- )
193
- self.datasets.append(dataset)
194
- self.tasks_dfs.append(dataset.meta.tasks)
195
-
196
- # Build index mapping: list of (dataset_idx, local_idx) for valid frames
197
- self.index_map: list[tuple[int, int, int]] = [] # (dataset_idx, local_idx, episode_idx)
198
- self._episode_boundaries: list[dict[int, tuple[int, int]]] = []
199
- self._episode_splits: list[tuple[set[int], set[int]]] = []
200
-
201
- total_episodes = 0
202
- total_frames = 0
203
- for ds_idx, dataset in enumerate(self.datasets):
204
- # Compute episode splits for this dataset
205
- train_eps, val_eps = self._compute_episode_splits_for_dataset(dataset)
206
- self._episode_splits.append((train_eps, val_eps))
207
-
208
- # Get episodes for current split
209
- split_episodes = self._get_split_episodes_for_dataset(ds_idx)
210
-
211
- # Build episode boundaries
212
- boundaries = self._build_episode_boundaries_for_dataset(dataset)
213
- self._episode_boundaries.append(boundaries)
214
-
215
- # Filter indices
216
- indices = self._filter_indices_for_dataset(ds_idx, dataset, split_episodes, boundaries)
217
- self.index_map.extend(indices)
218
-
219
- total_episodes += dataset.num_episodes
220
- total_frames += len(dataset)
221
-
222
- log.info(
223
- f"Loaded LIBERO dataset with {len(repo_id_list)} source(s) split={self.split!r} "
224
- f"camera_mode={self.camera_mode!r} "
225
- f"total_episodes={total_episodes} "
226
- f"total_frames={total_frames} "
227
- f"valid_indices={len(self.index_map)}"
228
- )
229
-
230
- def _compute_episode_splits_for_dataset(self, dataset: LeRobotDataset) -> tuple[set[int], set[int]]:
231
- """Compute train/val episode splits deterministically for a single dataset."""
232
- total_episodes = int(dataset.meta.total_episodes)
233
-
234
- if not (0.0 < self.val_ratio < 1.0):
235
- raise ValueError(f"{self.val_ratio=} must be in (0, 1).")
236
-
237
- n_val = max(1, int(round(total_episodes * self.val_ratio)))
238
- # val_eps = set(range(n_val))
239
- # train_eps = set(range(n_val, total_episodes))
240
-
241
- # Yihuai: Randomly select validation episodes instead of the first n_val episodes (otherwise task will be repeated)
242
- rng = random.Random(self.seed) # To ensure validation episodes are the same on all ranks
243
- val_eps = set(rng.sample(range(total_episodes), n_val))
244
- train_eps = set(range(total_episodes)) - val_eps
245
-
246
- log.info(f"train_eps={train_eps}, val_eps={val_eps}")
247
-
248
- return train_eps, val_eps
249
-
250
- def _get_split_episodes_for_dataset(self, ds_idx: int) -> set[int]:
251
- """Get the episode set for the current split for a specific dataset."""
252
- train_eps, val_eps = self._episode_splits[ds_idx]
253
- if self.split in {"val", "valid", "validation", "eval", "test"}:
254
- return val_eps
255
- elif self.split == "train":
256
- return train_eps
257
- else: # full
258
- return train_eps | val_eps
259
-
260
- def _build_episode_boundaries_for_dataset(self, dataset: LeRobotDataset) -> dict[int, tuple[int, int]]:
261
- """Build a dict of episode_index -> (start_frame, end_frame) for a single dataset."""
262
- boundaries: dict[int, tuple[int, int]] = {}
263
- for ep in dataset.meta.episodes:
264
- ep_idx = int(ep["episode_index"]) # type: ignore[index]
265
- start = int(ep["dataset_from_index"]) # type: ignore[index]
266
- end = int(ep["dataset_to_index"]) # type: ignore[index]
267
- boundaries[ep_idx] = (start, end)
268
- return boundaries
269
-
270
- def _filter_indices_for_dataset(
271
- self,
272
- ds_idx: int,
273
- dataset: LeRobotDataset,
274
- split_episodes: set[int],
275
- boundaries: dict[int, tuple[int, int]],
276
- ) -> list[tuple[int, int, int]]:
277
- """Filter valid indices for a single dataset, returning (dataset_idx, local_idx, episode_idx)."""
278
- index_map: list[tuple[int, int, int]] = []
279
- all_meta = list(dataset.meta.episodes)
280
-
281
- for ep_idx in split_episodes:
282
- if ep_idx >= len(all_meta):
283
- continue
284
- ep = all_meta[ep_idx]
285
-
286
- ep_start = int(ep["dataset_from_index"]) # type: ignore[index]
287
- ep_end = int(ep["dataset_to_index"]) # type: ignore[index]
288
-
289
- # Valid range: [start, end - chunk_length - 1] inclusive
290
- # We drop chunk_length frames at end to ensure we can query up to delta=chunk_length.
291
- start = ep_start
292
- end = ep_end - self.chunk_length - 1
293
-
294
- if end >= start:
295
- for local_idx in range(start, end + 1):
296
- index_map.append((ds_idx, local_idx, ep_idx))
297
-
298
- return index_map
299
-
300
- def __len__(self) -> int:
301
- return len(self.index_map)
302
-
303
- def _get_task_description(self, ds_idx: int, item: dict) -> str:
304
- """Get task description for the current item from meta/tasks.parquet.
305
-
306
- The tasks.parquet has task descriptions as the DataFrame index (row labels)
307
- and task_index as an integer column. We look up by task_index and return
308
- the corresponding index name (the actual task description string).
309
- """
310
- task_idx = item.get("task_index")
311
- if task_idx is not None:
312
- if isinstance(task_idx, torch.Tensor):
313
- task_idx = task_idx.item()
314
- task_idx = int(task_idx)
315
- tasks_df = self.tasks_dfs[ds_idx]
316
- if task_idx in tasks_df["task_index"].values:
317
- row = tasks_df[tasks_df["task_index"] == task_idx].iloc[0]
318
- # The task description is the index name (row label), not a column value
319
- return str(row.name)
320
- raise ValueError(f"Task index {task_idx} not found in tasks.parquet for dataset {ds_idx}")
321
-
322
- def _compute_anchored_actions(
323
- self,
324
- state_raw: torch.Tensor,
325
- action_raw: torch.Tensor,
326
- ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
327
- """Compute anchored relative actions (batched).
328
-
329
- Converts frame-wise relative actions to anchored relative actions where each
330
- action[t] represents the target pose (after applying action[t] to state[t])
331
- expressed in state 0's local coordinate frame.
332
-
333
- Mathematical formulation:
334
- 1. Compute target in world frame (LIBERO convention):
335
- - p_{t+1} = p_t + delta_p[t] (position addition in world frame)
336
- - R_{t+1} = R_delta[t] @ R_t (rotation composition, delta first)
337
- 2. Compute anchored (left-multiply by T_0^{-1}):
338
- - anchored_pos[t] = R_0^T @ (p_{t+1} - p_0)
339
- - anchored_rot[t] = R_0^T @ R_{t+1}
340
-
341
- Args:
342
- state_raw: State tensor of shape (T+1, 8): [x, y, z, ax, ay, az, grip1, grip2]
343
- where (ax, ay, az) is axis-angle rotation.
344
- action_raw: Action tensor of shape (T+1, 7): [dx, dy, dz, dax, day, daz, grip]
345
- where (dax, day, daz) is axis-angle rotation delta.
346
-
347
- Returns:
348
- anchored_translation: (T, 3) - position in state_0's local frame
349
- anchored_rotation_9d: (T, 9) - rotation relative to state_0 as flattened 3x3 matrix
350
- gripper: (T, 1) - original gripper commands (unchanged)
351
- """
352
- # Extract positions and rotations from states
353
- p_states = state_raw[:, :3] # [T+1,3]
354
- rotvec_states = state_raw[:, 3:6] # [T+1,3] - axis-angle
355
-
356
- # Extract deltas from actions (use first T actions)
357
- delta_p = action_raw[:-1, :3] # [T,3]
358
- delta_rotvec = action_raw[:-1, 3:6] # [T,3] - axis-angle delta
359
- gripper = action_raw[:-1, 6:7] # [T,1]
360
-
361
- # Convert all axis-angle to rotation matrices (batched)
362
- R_states = convert_rotation(rotvec_states, input_format="axisangle", output_format="matrix") # [T+1,3,3]
363
- R_deltas = convert_rotation(delta_rotvec, input_format="axisangle", output_format="matrix") # [T,3,3]
364
-
365
- # Initial pose (state 0)
366
- p_0 = p_states[0] # [3]
367
- R_0 = R_states[0] # [3,3]
368
- R_0_T = R_0.T # [3,3] - transpose for inverse rotation
369
-
370
- # Current states for t = 0..T-1
371
- p_t = p_states[:-1] # [T,3]
372
- R_t = R_states[:-1] # [T,3,3]
373
-
374
- # Step 1: Compute target poses in world frame (LIBERO convention)
375
- # p_target = p_t + delta_p
376
- p_target = p_t + delta_p # [T,3]
377
-
378
- # R_target = R_delta @ R_t (batched matrix multiply)
379
- R_target = torch.bmm(R_deltas, R_t) # [T,3,3]
380
-
381
- # Step 2: Compute anchored (in state_0's local frame)
382
- # anchored_p = R_0^T @ (p_target - p_0)
383
- displacement = p_target - p_0 # [T,3]
384
- anchored_p = (R_0_T @ displacement.T).T # [T,3]
385
-
386
- # anchored_R = R_0^T @ R_target (batched)
387
- R_0_T_expanded = R_0_T.unsqueeze(0).expand(R_target.shape[0], -1, -1) # [T,3,3]
388
- anchored_R = torch.bmm(R_0_T_expanded, R_target) # [T,3,3]
389
-
390
- return anchored_p, anchored_R, gripper
391
-
392
- def _convert_rotation_to_repr(self, rotation_matrix: torch.Tensor) -> torch.Tensor:
393
- """Convert rotation matrix to the desired representation.
394
-
395
- Args:
396
- rotation_matrix: Rotation matrices of shape (T, 3, 3).
397
-
398
- Returns:
399
- Rotation in the configured ``rotation_space`` format.
400
- """
401
- return convert_rotation(rotation_matrix, "matrix", libero_rotation_format(self.rotation_space))
402
-
403
- def _normalizer_filename(self) -> str:
404
- rotation_suffix = {
405
- "3d": "3d",
406
- "6d": "rot6d",
407
- "9d": "rot9d",
408
- }.get(self.rotation_space)
409
- if rotation_suffix is None:
410
- raise ValueError(f"Unsupported rotation_space={self.rotation_space!r}.")
411
- action_space = self.action_space.replace("-", "_")
412
- return f"{self._embodiment_type}_{action_space}_{rotation_suffix}.json"
413
-
414
- def _resolve_action_stats_path(self, action_stats_path: str | None) -> Path:
415
- if action_stats_path is None:
416
- stats_path = self._NORMALIZERS_DIR / self._normalizer_filename()
417
- if stats_path.exists():
418
- return stats_path
419
- raise FileNotFoundError(
420
- f"Could not find bundled LIBERO action stats at {stats_path}. "
421
- "Pass action_stats_path explicitly or regenerate stats with compute_action_stats.py."
422
- )
423
-
424
- stats_path = Path(action_stats_path)
425
- if stats_path.is_absolute():
426
- if stats_path.exists():
427
- return stats_path
428
- raise FileNotFoundError(f"Could not find action_stats_path={action_stats_path!r}.")
429
-
430
- module_dir = Path(__file__).resolve().parent
431
- candidates: list[Path] = []
432
- for parent in module_dir.parents:
433
- candidates.append(parent / stats_path)
434
- candidates.append(self._NORMALIZERS_DIR / stats_path.name)
435
- candidates.append(module_dir / stats_path.name)
436
- for candidate in candidates:
437
- if candidate.exists():
438
- return candidate
439
- raise FileNotFoundError(
440
- f"Could not resolve action_stats_path={action_stats_path!r}; tried: {[str(c) for c in candidates]}"
441
- )
442
-
443
- def _set_range_denormalization_stats(self) -> None:
444
- if self._norm_stats is None:
445
- return
446
-
447
- if self.action_normalization == "minmax":
448
- lo_key, hi_key = "min", "max"
449
- elif self.action_normalization in ("quantile", "quantile_rot"):
450
- lo_key, hi_key = "q01", "q99"
451
- else:
452
- return
453
-
454
- if lo_key not in self._norm_stats or hi_key not in self._norm_stats:
455
- raise ValueError(
456
- f"Action stats for {self.action_normalization!r} normalization require "
457
- f"{lo_key!r} and {hi_key!r} entries."
458
- )
459
- self.action_min = self._norm_stats[lo_key] # [D]
460
- self.action_max = self._norm_stats[hi_key] # [D]
461
- action_range = self.action_max - self.action_min # [D]
462
- self.action_range = torch.clamp(action_range, min=1e-6) # [D]
463
-
464
- def __getitem__(self, idx: int, _retry_count: int = 0) -> dict[str, torch.Tensor | str]:
465
- """Get a single item from the dataset."""
466
- max_retries = 10
467
- ds_idx, local_idx, ep_idx = self.index_map[idx]
468
- dataset = self.datasets[ds_idx]
469
- try:
470
- item = dataset[local_idx]
471
- except Exception as e:
472
- log.warning(
473
- f"Error loading item (retry {_retry_count}/{max_retries}): idx={idx}, ds_idx={ds_idx}, "
474
- f"local_idx={local_idx}, ep_idx={ep_idx}, repo_id={dataset.meta.repo_id}, error={e}"
475
- )
476
- if _retry_count >= max_retries:
477
- raise RuntimeError(f"Failed to load data after {max_retries} retries") from e
478
- new_idx = random.randint(0, len(self) - 1)
479
- return self.__getitem__(new_idx, _retry_count + 1)
480
-
481
- if self.mode == "joint":
482
- mode = random.choice(["forward_dynamics", "inverse_dynamics", "policy", "image2video"])
483
- else:
484
- mode = self.mode
485
-
486
- # Get task description for ai_caption
487
- task_description = self._get_task_description(ds_idx, item)
488
-
489
- # Process video based on camera mode (skipped entirely when
490
- # skip_video_loading=True; image keys are also absent from
491
- # delta_timestamps so LeRobot never decoded them).
492
- video: torch.Tensor | None
493
- if self._skip_video_loading:
494
- video = None
495
- else:
496
- if self.camera_mode == "concat_view":
497
- # Load both cameras and concatenate horizontally
498
- video_1: torch.Tensor = item["observation.images.image"]
499
- video_2: torch.Tensor = item["observation.images.wrist_image"]
500
-
501
- # Resize each if needed
502
- if video_1.shape[-1] != self.image_size or video_1.shape[-2] != self.image_size:
503
- video_1 = F.resize(video_1, [self.image_size, self.image_size])
504
- if video_2.shape[-1] != self.image_size or video_2.shape[-2] != self.image_size:
505
- video_2 = F.resize(video_2, [self.image_size, self.image_size])
506
-
507
- # Concatenate along width dimension (last dim for TCHW)
508
- video_tchw = torch.cat([video_1, video_2], dim=-1) # (T, C, H, W*2)
509
- else:
510
- # Single camera mode
511
- image_key = self.image_keys[0]
512
- video_tchw = item[image_key]
513
-
514
- # Resize if needed
515
- if video_tchw.shape[-1] != self.image_size or video_tchw.shape[-2] != self.image_size:
516
- video_tchw = F.resize(video_tchw, [self.image_size, self.image_size])
517
-
518
- # Convert to uint8 and transpose to (C, T, H, W)
519
- video = (video_tchw * 255).clamp(0, 255).to(torch.uint8).permute(1, 0, 2, 3)
520
-
521
- # Action (raw): LIBERO actions are 7D (6 DoF + gripper)
522
- action_raw: torch.Tensor = item["action"]
523
- # State (raw): LIBERO state is 8D (6 DoF + 2 gripper states)
524
- state_raw: torch.Tensor = item["observation.state"]
525
-
526
- # Action: (T+1, D) -> (T, D)
527
- # Take all but last action
528
- # LIBERO action format: [x, y, z, ax, ay, az, gripper] (7D) where (ax,ay,az) is axis-angle
529
-
530
- if self.action_space == "relative":
531
- # Compute anchored relative actions
532
- # Returns: translation (T, 3), rotation_matrix (T, 3, 3), gripper (T, 1)
533
- translation, rotation_matrix, gripper = self._compute_anchored_actions(state_raw, action_raw.clone())
534
- elif self.action_space == "frame_wise_relative":
535
- action = action_raw[:-1].clone() # [T,7]
536
- translation = action[:, :3] # [T,3]
537
- rotation_rotvec = action[:, 3:6] # [T,3]
538
- gripper = action[:, 6:] # [T,1]
539
- rotation_matrix = convert_rotation(
540
- rotation_rotvec, input_format="axisangle", output_format="matrix"
541
- ) # [T,3,3]
542
- else:
543
- raise ValueError(f"Unsupported action space: {self.action_space}")
544
-
545
- rotation = self._convert_rotation_to_repr(rotation_matrix) # [T,rot_dim]
546
- action = torch.cat([translation, rotation, gripper], dim=-1) # [T,action_dim]
547
-
548
- # Compute idle_frames from the raw (un-normalized) action, only when the
549
- # action layout has correct per-frame idle semantics (frame_wise_relative
550
- # ⇔ backward_framewise). The other action_spaces ("relative",
551
- # "absolute") encode per-frame motion differently and would not give
552
- # meaningful idle counts under the same threshold check.
553
- idle_frames: torch.Tensor | None = None
554
- if self.action_space == "frame_wise_relative":
555
- try:
556
- spec = build_action_spec(Pos(), Rot(libero_rotation_format(self.rotation_space)), Gripper())
557
- n = compute_idle_frames(action, spec)
558
- idle_frames = torch.tensor(n, dtype=torch.long)
559
- except (ValueError, TypeError):
560
- idle_frames = None
561
-
562
- if self.action_normalization is not None and self._norm_stats is not None and self.action_min is not None:
563
- if action.shape[-1] != self.action_min.shape[0]:
564
- raise ValueError(
565
- f"Action dimension {action.shape[-1]} does not match stats dimension "
566
- f"{self.action_min.shape[0]}. Recompute stats for the current "
567
- f"rotation_space={self.rotation_space!r} and action_space={self.action_space!r}."
568
- )
569
- method = "quantile" if self.action_normalization == "quantile_rot" else self.action_normalization
570
- action = normalize_action(action, method, self._norm_stats) # [T,D]
571
-
572
- # Index
573
- key = torch.tensor([local_idx], dtype=torch.long)
574
-
575
- if self.camera_mode == "image":
576
- viewpoint = "third_person_view"
577
- elif self.camera_mode == "wrist_image":
578
- viewpoint = "wrist_view"
579
- else:
580
- viewpoint = "concat_view"
581
-
582
- result: dict[str, torch.Tensor | str] = {
583
- "source_repo_id": dataset.meta.repo_id,
584
- "video": video,
585
- "action": action,
586
- "action_raw": action_raw,
587
- "conditioning_fps": torch.tensor(self.fps, dtype=torch.long),
588
- "prompt": task_description,
589
- "ai_caption": task_description,
590
- "mode": mode,
591
- "state": state_raw,
592
- "action_space": self.action_space,
593
- "rotation_space": self.rotation_space,
594
- "pose_coordinate_frame": self.pose_coordinate_frame,
595
- "__key__": key,
596
- "domain_id": torch.tensor(self.domain_id, dtype=torch.long),
597
- "viewpoint": viewpoint,
598
- }
599
- if idle_frames is not None:
600
- result["idle_frames"] = idle_frames
601
-
602
- if self.camera_mode == "concat_view" and not self._skip_video_loading:
603
- result["additional_view_description"] = (
604
- "The left half shows the third-person view; the right half shows the wrist-mounted camera."
605
- )
606
-
607
- return result
608
-
609
- @property
610
- def action_dim(self) -> int:
611
- return libero_action_dim(self.rotation_space)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cosmos-framework/cosmos_framework/data/vfm/action/libero_pose_utils.py DELETED
@@ -1,69 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: OpenMDW-1.1
3
-
4
- """Small LIBERO pose helpers shared by training and closed-loop eval."""
5
-
6
- from __future__ import annotations
7
-
8
- import numpy as np
9
- import torch
10
-
11
- from cosmos_framework.data.vfm.action.pose_utils import (
12
- RotationConvention,
13
- build_abs_pose_from_components,
14
- )
15
-
16
- # Same local-frame post-rotation pattern used by DROID/Bridge/Fractal:
17
- # R_opencv = R_native @ *_TO_OPENCV.
18
- LIBERO_TO_OPENCV: np.ndarray = np.array(
19
- [[0.0, -1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 0.0, 1.0]],
20
- dtype=np.float32,
21
- )
22
-
23
- LIBERO_ROTATION_FORMATS: dict[str, RotationConvention] = {
24
- "3d": "axisangle",
25
- "6d": "rot6d",
26
- "9d": "rot9d",
27
- }
28
- LIBERO_ACTION_DIMS: dict[str, int] = {"3d": 7, "6d": 10, "9d": 13}
29
-
30
-
31
- def libero_rotation_format(rotation_space: str) -> RotationConvention:
32
- """Return the shared ``pose_utils`` rotation format for a LIBERO setting."""
33
- rotation_format = LIBERO_ROTATION_FORMATS.get(rotation_space)
34
- if rotation_format is None:
35
- raise ValueError(f"Unsupported rotation_space={rotation_space!r}. Use 3d/6d/9d.")
36
- return rotation_format
37
-
38
-
39
- def libero_action_dim(rotation_space: str) -> int:
40
- """Return ``[xyz, rotation, gripper]`` action width for LIBERO."""
41
- action_dim = LIBERO_ACTION_DIMS.get(rotation_space)
42
- if action_dim is None:
43
- raise ValueError(f"Unsupported rotation_space={rotation_space!r}. Use 3d/6d/9d.")
44
- return action_dim
45
-
46
-
47
- def libero_rotation_space_from_action_dim(action_dim: int) -> str:
48
- """Infer LIBERO rotation space from unpadded action width."""
49
- for rotation_space, dim in LIBERO_ACTION_DIMS.items():
50
- if dim == action_dim:
51
- return rotation_space
52
- raise ValueError(f"Unable to infer rotation_space from action_dim={action_dim}.")
53
-
54
-
55
- def build_libero_abs_pose(state_raw: torch.Tensor | np.ndarray, *, to_opencv: bool) -> np.ndarray:
56
- """Build absolute LIBERO EE poses from state rows.
57
-
58
- ``state_raw`` is ``[x,y,z,axisangle(3),gripper(2)]``. When requested, the
59
- local EE frame is post-rotated into the shared OpenCV-style action frame.
60
- """
61
- if isinstance(state_raw, torch.Tensor):
62
- state_np = state_raw.detach().cpu().numpy().astype(np.float32, copy=False)
63
- else:
64
- state_np = np.asarray(state_raw, dtype=np.float32)
65
-
66
- poses_abs = build_abs_pose_from_components(state_np[:, :3], state_np[:, 3:6], "axisangle")
67
- if to_opencv:
68
- poses_abs[:, :3, :3] = poses_abs[:, :3, :3] @ LIBERO_TO_OPENCV
69
- return poses_abs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cosmos-framework/cosmos_framework/data/vfm/action/normalizers/bridge_orig_lerobot_backward_framewise_rot6d.json DELETED
@@ -1,33 +0,0 @@
1
- {
2
- "metadata": {
3
- "embodiment_type": "bridge_orig_lerobot",
4
- "pose_convention": "backward_framewise",
5
- "rotation_format": "rot6d",
6
- "action_dim": 10,
7
- "skip_rotation_dims": [3, 4, 5, 6, 7, 8],
8
- "chunk_length": 16,
9
- "sample_stride": 16,
10
- "dataset_name": "bridge_20260416",
11
- "dataset_class": "BridgeOrigLeRobotDataset",
12
- "dataset_root": "",
13
- "split": "train",
14
- "num_samples_stats": 83036,
15
- "reservoir_size": 5000000
16
- },
17
- "global": {
18
- "mean": [-0.000094, -0.000394, 0.001623, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.582683],
19
- "std": [ 0.013297, 0.009985, 0.012079, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.489959],
20
- "min": [-0.309451, -0.074740, -0.082767, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
21
- "max": [ 0.127018, 0.414660, 0.493186, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000],
22
- "q01": [-0.038884, -0.028667, -0.037840, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
23
- "q99": [ 0.039722, 0.029068, 0.026702, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000]
24
- },
25
- "global_raw": {
26
- "mean": [-0.000094, -0.000394, 0.001623, 0.998307, -0.001371, 0.000061, 0.001414, 0.998226, -0.000154, 0.582683],
27
- "std": [ 0.013297, 0.009985, 0.012079, 0.004630, 0.050168, 0.029018, 0.050165, 0.004328, 0.031742, 0.489959],
28
- "min": [-0.309451, -0.074740, -0.082767, -0.845782, -0.636628, -0.401535, -0.590214, -0.217448, -0.979635, 0.000000],
29
- "max": [ 0.127018, 0.414660, 0.493186, 1.000000, 0.362611, 0.601211, 0.619479, 1.000000, 0.365993, 1.000000],
30
- "q01": [-0.038884, -0.028667, -0.037840, 0.976292, -0.163098, -0.081545, -0.160193, 0.976322, -0.078872, 0.000000],
31
- "q99": [ 0.039722, 0.029068, 0.026702, 1.000000, 0.160195, 0.081655, 0.163227, 1.000000, 0.095189, 1.000000]
32
- }
33
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cosmos-framework/cosmos_framework/data/vfm/action/normalizers/droid_lerobot_backward_framewise_rot6d.json DELETED
@@ -1,33 +0,0 @@
1
- {
2
- "metadata": {
3
- "embodiment_type": "droid_lerobot",
4
- "pose_convention": "backward_framewise",
5
- "rotation_format": "rot6d",
6
- "action_dim": 10,
7
- "skip_rotation_dims": [3, 4, 5, 6, 7, 8],
8
- "chunk_length": 16,
9
- "sample_stride": 16,
10
- "dataset_name": "droid_20260418",
11
- "dataset_class": "DROIDLeRobotDataset",
12
- "dataset_root": "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/droid_plus_lerobot_640x360_20260412",
13
- "split": "train",
14
- "num_samples_stats": 1321153,
15
- "reservoir_size": 5000000
16
- },
17
- "global": {
18
- "mean": [-0.000017, -0.000612, 0.000568, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.588911],
19
- "std": [ 0.004539, 0.004054, 0.004999, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.441186],
20
- "min": [-0.075397, -0.057288, -0.056677, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
21
- "max": [ 0.073107, 0.082187, 0.077080, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000],
22
- "q01": [-0.014200, -0.013416, -0.015206, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
23
- "q99": [ 0.014515, 0.011517, 0.014520, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000]
24
- },
25
- "global_raw": {
26
- "mean": [-0.000017, -0.000612, 0.000568, 0.999830, 0.000227, -0.000152, -0.000222, 0.999818, 0.000417, 0.588911],
27
- "std": [ 0.004539, 0.004054, 0.004999, 0.000336, 0.014924, 0.010784, 0.014927, 0.000351, 0.011903, 0.441186],
28
- "min": [-0.075397, -0.057288, -0.056677, 0.695640, -0.220599, -0.195892, -0.697421, 0.600468, -0.154176, 0.000000],
29
- "max": [ 0.073107, 0.082187, 0.077080, 1.000000, 0.698449, 0.168089, 0.220605, 1.000000, 0.391206, 1.000000],
30
- "q01": [-0.014200, -0.013416, -0.015206, 0.998459, -0.047659, -0.034774, -0.047609, 0.998428, -0.035553, 0.000000],
31
- "q99": [ 0.014515, 0.011517, 0.014520, 1.000000, 0.047596, 0.034660, 0.047654, 1.000000, 0.038888, 1.000000]
32
- }
33
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cosmos-framework/cosmos_framework/data/vfm/action/normalizers/fractal_backward_framewise_rot6d.json DELETED
@@ -1,33 +0,0 @@
1
- {
2
- "metadata": {
3
- "embodiment_type": "fractal",
4
- "pose_convention": "backward_framewise",
5
- "rotation_format": "rot6d",
6
- "action_dim": 10,
7
- "skip_rotation_dims": [3, 4, 5, 6, 7, 8],
8
- "chunk_length": 16,
9
- "sample_stride": 16,
10
- "dataset_name": "fractal_20260413",
11
- "dataset_class": "FractalLeRobotDataset",
12
- "dataset_root": "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/fractal20220817_data_no_noops",
13
- "split": "train",
14
- "num_samples_stats": 166961,
15
- "reservoir_size": 5000000
16
- },
17
- "global": {
18
- "mean": [ 0.002259, 0.000721, 0.009372, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.526947],
19
- "std": [ 0.014178, 0.016428, 0.022554, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.499273],
20
- "min": [-0.151886, -0.176424, -0.194576, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
21
- "max": [ 0.130892, 0.190835, 0.193839, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000],
22
- "q01": [-0.039816, -0.049270, -0.056266, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
23
- "q99": [ 0.043860, 0.050352, 0.072505, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000]
24
- },
25
- "global_raw": {
26
- "mean": [ 0.002259, 0.000721, 0.009372, 0.998347, 0.001789, 0.002694, -0.001861, 0.997811, 0.016366, 0.526947],
27
- "std": [ 0.014178, 0.016428, 0.022554, 0.003377, 0.043416, 0.037369, 0.043211, 0.004566, 0.047057, 0.499273],
28
- "min": [-0.151886, -0.176424, -0.194576, 0.520558, -0.676280, -0.822475, -0.460521, 0.736643, -0.517041, 0.000000],
29
- "max": [ 0.130892, 0.190835, 0.193839, 1.000000, 0.461026, 0.403940, 0.671708, 1.000000, 0.505528, 1.000000],
30
- "q01": [-0.039816, -0.049270, -0.056266, 0.983667, -0.134543, -0.107048, -0.126518, 0.977277, -0.091363, 0.000000],
31
- "q99": [ 0.043860, 0.050352, 0.072505, 1.000000, 0.127404, 0.107273, 0.134140, 1.000000, 0.179731, 1.000000]
32
- }
33
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cosmos-framework/cosmos_framework/data/vfm/action/normalizers/libero_native_frame_wise_relative_rot6d.json DELETED
@@ -1,37 +0,0 @@
1
- {
2
- "metadata": {
3
- "embodiment_type": "libero",
4
- "pose_convention": "frame_wise_relative",
5
- "pose_coordinate_frame": "native",
6
- "rotation_format": "6d",
7
- "action_dim": 10,
8
- "skip_rotation_dims": [3, 4, 5, 6, 7, 8],
9
- "chunk_length": 16,
10
- "sample_stride": null,
11
- "dataset_name": "libero",
12
- "dataset_class": "LIBERODataset",
13
- "dataset_root": ["outputs/libero_datasets/libero_10", "outputs/libero_datasets/libero_object", "outputs/libero_datasets/libero_spatial", "outputs/libero_datasets/libero_goal"],
14
- "_comment": "Dataset paths are placeholders; the statistics values are independent of local dataset location.",
15
- "split": "train",
16
- "num_samples_stats": 10000,
17
- "reservoir_size": 50000,
18
- "max_samples": 10000,
19
- "sampling_seed": 42
20
- },
21
- "global": {
22
- "mean": [ 0.050704, 0.097407, -0.094833, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.476725],
23
- "std": [ 0.333621, 0.387175, 0.457140, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.499460],
24
- "min": [-0.937500, -0.937500, -0.937500, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
25
- "max": [ 0.937500, 0.937500, 0.937500, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000],
26
- "q01": [-0.723214, -0.808929, -0.937500, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
27
- "q99": [ 0.937500, 0.870536, 0.937500, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000]
28
- },
29
- "global_raw": {
30
- "mean": [ 0.050704, 0.097407, -0.094833, 0.994873, -0.004579, -0.004288, 0.004389, 0.996104, 0.001109, 0.476725],
31
- "std": [ 0.333621, 0.387175, 0.457140, 0.010807, 0.077802, 0.063386, 0.078571, 0.009994, 0.038504, 0.499460],
32
- "min": [-0.937500, -0.937500, -0.937500, 0.902028, -0.356085, -0.367416, -0.370434, 0.921907, -0.255000, 0.000000],
33
- "max": [ 0.937500, 0.937500, 0.937500, 1.000000, 0.368853, 0.341214, 0.356395, 1.000000, 0.348251, 1.000000],
34
- "q01": [-0.723214, -0.808929, -0.937500, 0.934955, -0.223431, -0.189878, -0.334735, 0.938516, -0.107736, 0.000000],
35
- "q99": [ 0.937500, 0.870536, 0.937500, 1.000000, 0.331000, 0.163153, 0.226216, 1.000000, 0.127158, 1.000000]
36
- }
37
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cosmos-framework/cosmos_framework/data/vfm/action/normalizers/robomind-franka-dual_backward_framewise_rot6d.json DELETED
@@ -1,33 +0,0 @@
1
- {
2
- "metadata": {
3
- "embodiment_type": "robomind-franka-dual",
4
- "pose_convention": "backward_framewise",
5
- "rotation_format": "rot6d",
6
- "action_dim": 20,
7
- "skip_rotation_dims": [3, 4, 5, 6, 7, 8, 13, 14, 15, 16, 17, 18],
8
- "chunk_length": 16,
9
- "sample_stride": 16,
10
- "dataset_name": "robomind_franka_dual_20260414",
11
- "dataset_class": "RoboMINDFrankaDataset",
12
- "dataset_root": "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/RoboMIND_20251228",
13
- "split": "train",
14
- "num_samples_stats": 21410,
15
- "reservoir_size": 5000000
16
- },
17
- "global": {
18
- "mean": [ 0.000231, 0.000179, -0.000319, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.638652, 0.000148, -0.000377, -0.000241, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.815273],
19
- "std": [ 0.014881, 0.008081, 0.014371, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.464058, 0.010628, 0.005868, 0.007900, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.366049],
20
- "min": [-0.115093, -0.096415, -0.112595, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000, -0.091252, -0.052148, -0.113650, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
21
- "max": [ 0.114941, 0.063433, 0.098721, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.123908, 0.077951, 0.080229, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.999628],
22
- "q01": [-0.051367, -0.031964, -0.046482, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000, -0.035108, -0.021212, -0.029788, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
23
- "q99": [ 0.043729, 0.021737, 0.036738, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.047581, 0.021270, 0.025712, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.995443]
24
- },
25
- "global_raw": {
26
- "mean": [ 0.000231, 0.000179, -0.000319, 0.999350, 0.000318, -0.000135, -0.000286, 0.999586, -0.000051, 0.638652, 0.000148, -0.000377, -0.000241, 0.999294, 0.000343, 0.000431, -0.000147, 0.999580, -0.000570, 0.815273],
27
- "std": [ 0.014881, 0.008081, 0.014371, 0.002235, 0.020196, 0.029781, 0.020185, 0.001125, 0.020484, 0.464058, 0.010628, 0.005868, 0.007900, 0.002664, 0.025404, 0.027550, 0.025193, 0.001657, 0.014210, 0.366049],
28
- "min": [-0.115093, -0.096415, -0.112595, 0.944314, -0.271877, -0.325264, -0.254808, 0.962274, -0.227188, 0.000000, -0.091252, -0.052148, -0.113650, 0.941406, -0.265241, -0.273484, -0.290840, 0.954990, -0.264631, 0.000000],
29
- "max": [ 0.114941, 0.063433, 0.098721, 1.000000, 0.258475, 0.270230, 0.271943, 1.000000, 0.221936, 1.000000, 0.123908, 0.077951, 0.080229, 1.000000, 0.296517, 0.333596, 0.269131, 1.000000, 0.139695, 0.999628],
30
- "q01": [-0.051367, -0.031964, -0.046482, 0.988101, -0.053179, -0.128603, -0.075432, 0.994427, -0.059973, 0.000000, -0.035108, -0.021212, -0.029788, 0.986086, -0.098043, -0.111441, -0.093441, 0.991492, -0.058030, 0.000000],
31
- "q99": [ 0.043729, 0.021737, 0.036738, 1.000000, 0.075612, 0.102791, 0.053223, 1.000000, 0.077057, 1.000000, 0.047581, 0.021270, 0.025712, 1.000000, 0.095525, 0.126049, 0.098778, 1.000000, 0.041914, 0.995443]
32
- }
33
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cosmos-framework/cosmos_framework/data/vfm/action/normalizers/robomind-franka_backward_framewise_rot6d.json DELETED
@@ -1,33 +0,0 @@
1
- {
2
- "metadata": {
3
- "embodiment_type": "robomind-franka",
4
- "pose_convention": "backward_framewise",
5
- "rotation_format": "rot6d",
6
- "action_dim": 10,
7
- "skip_rotation_dims": [3, 4, 5, 6, 7, 8],
8
- "chunk_length": 16,
9
- "sample_stride": 16,
10
- "dataset_name": "robomind_franka_20260414",
11
- "dataset_class": "RoboMINDFrankaDataset",
12
- "dataset_root": "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/RoboMIND_20251228",
13
- "split": "train",
14
- "num_samples_stats": 141658,
15
- "reservoir_size": 5000000
16
- },
17
- "global": {
18
- "mean": [ 0.000241, 0.000073, -0.000597, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.630501],
19
- "std": [ 0.020545, 0.010725, 0.022054, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.434021],
20
- "min": [-0.184377, -0.130924, -0.183947, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
21
- "max": [ 0.227682, 0.134118, 0.133222, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000],
22
- "q01": [-0.065029, -0.030683, -0.075321, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
23
- "q99": [ 0.068546, 0.036309, 0.051772, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000]
24
- },
25
- "global_raw": {
26
- "mean": [ 0.000241, 0.000073, -0.000597, 0.998782, 0.000605, 0.000003, -0.000592, 0.998245, -0.000508, 0.630501],
27
- "std": [ 0.020545, 0.010725, 0.022054, 0.004056, 0.043101, 0.023671, 0.043102, 0.004948, 0.040306, 0.434021],
28
- "min": [-0.184377, -0.130924, -0.183947, 0.837403, -0.525301, -0.384252, -0.543663, 0.801190, -0.490979, 0.000000],
29
- "max": [ 0.227682, 0.134118, 0.133222, 1.000000, 0.543800, 0.389145, 0.522029, 1.000000, 0.414190, 1.000000],
30
- "q01": [-0.065029, -0.030683, -0.075321, 0.981664, -0.137429, -0.069593, -0.140220, 0.976885, -0.140399, 0.000000],
31
- "q99": [ 0.068546, 0.036309, 0.051772, 1.000000, 0.140290, 0.079942, 0.137529, 1.000000, 0.113651, 1.000000]
32
- }
33
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cosmos-framework/cosmos_framework/data/vfm/action/robomind_franka_dataset.py CHANGED
@@ -25,7 +25,6 @@ import torch
25
  import torch.nn.functional as F
26
 
27
  from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
28
- ActionNormalization,
29
  ActionSpec,
30
  BaseActionLeRobotDataset,
31
  Gripper,
@@ -98,7 +97,6 @@ class RoboMINDFrankaDataset(BaseActionLeRobotDataset):
98
  mode: str = "policy",
99
  embodiment_type: str = "robomind-franka",
100
  pose_convention: str = "backward_framewise",
101
- action_normalization: ActionNormalization | None = None,
102
  viewpoint: Viewpoint = "concat_view",
103
  enable_fast_init: bool = False,
104
  ) -> None:
@@ -119,7 +117,6 @@ class RoboMINDFrankaDataset(BaseActionLeRobotDataset):
119
  viewpoint=viewpoint,
120
  pose_convention=pose_convention,
121
  rotation_format="rot6d",
122
- action_normalization=action_normalization,
123
  tolerance_s=1e-4,
124
  enable_fast_init=enable_fast_init,
125
  )
 
25
  import torch.nn.functional as F
26
 
27
  from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
 
28
  ActionSpec,
29
  BaseActionLeRobotDataset,
30
  Gripper,
 
97
  mode: str = "policy",
98
  embodiment_type: str = "robomind-franka",
99
  pose_convention: str = "backward_framewise",
 
100
  viewpoint: Viewpoint = "concat_view",
101
  enable_fast_init: bool = False,
102
  ) -> None:
 
117
  viewpoint=viewpoint,
118
  pose_convention=pose_convention,
119
  rotation_format="rot6d",
 
120
  tolerance_s=1e-4,
121
  enable_fast_init=enable_fast_init,
122
  )
cosmos-framework/cosmos_framework/data/vfm/action/umi_lerobot_dataset.py CHANGED
@@ -12,7 +12,7 @@ import torch
12
  from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata
13
 
14
  from cosmos_framework.data.vfm.action.action_spec import ActionSpec, Gripper, Pos, Rot, build_action_spec
15
- from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import ActionNormalization, BaseActionLeRobotDataset
16
  from cosmos_framework.data.vfm.action.pose_utils import PoseConvention, build_abs_pose_from_components, pose_abs_to_rel
17
  from cosmos_framework.data.vfm.action.viewpoint_utils import Viewpoint
18
 
@@ -35,7 +35,6 @@ class UMIFastLeRobotDataset(BaseActionLeRobotDataset):
35
  split: Literal["train", "val", "full"] = "train",
36
  mode: str = "policy",
37
  pose_convention: PoseConvention = "backward_framewise",
38
- action_normalization: ActionNormalization | None = None,
39
  viewpoint: Viewpoint = "ego_view",
40
  enable_fast_init: bool = False,
41
  ) -> None:
@@ -50,7 +49,6 @@ class UMIFastLeRobotDataset(BaseActionLeRobotDataset):
50
  viewpoint=viewpoint,
51
  pose_convention=pose_convention,
52
  rotation_format="rot6d",
53
- action_normalization=action_normalization,
54
  tolerance_s=1e-4,
55
  enable_fast_init=enable_fast_init,
56
  )
 
12
  from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata
13
 
14
  from cosmos_framework.data.vfm.action.action_spec import ActionSpec, Gripper, Pos, Rot, build_action_spec
15
+ from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import BaseActionLeRobotDataset
16
  from cosmos_framework.data.vfm.action.pose_utils import PoseConvention, build_abs_pose_from_components, pose_abs_to_rel
17
  from cosmos_framework.data.vfm.action.viewpoint_utils import Viewpoint
18
 
 
35
  split: Literal["train", "val", "full"] = "train",
36
  mode: str = "policy",
37
  pose_convention: PoseConvention = "backward_framewise",
 
38
  viewpoint: Viewpoint = "ego_view",
39
  enable_fast_init: bool = False,
40
  ) -> None:
 
49
  viewpoint=viewpoint,
50
  pose_convention=pose_convention,
51
  rotation_format="rot6d",
 
52
  tolerance_s=1e-4,
53
  enable_fast_init=enable_fast_init,
54
  )
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/README.md CHANGED
@@ -4,10 +4,15 @@ Interactive 3D viewer for release-supported robot action datasets. It renders
4
  RGB frames, end-effector trajectories, and robot meshes/IK with
5
  [`viser`](https://viser.studio/), MuJoCo, and Pinocchio.
6
 
7
- The OSS viewer registry is intentionally scoped to the public action datasets:
8
 
9
- - Bridge LeRobot v3: [`nvidia/bridge_lerobot_v3`](https://huggingface.co/datasets/nvidia/bridge_lerobot_v3)
10
- - LIBERO LeRobot v3: [`nvidia/LIBERO_LeRobot_v3`](https://huggingface.co/datasets/nvidia/LIBERO_LeRobot_v3)
 
 
 
 
 
11
 
12
  The dataset LazyConfig entries live in:
13
 
@@ -17,29 +22,10 @@ cosmos_framework/data/vfm/action/urdf_visualizer/action_datasets.py
17
 
18
  ## Dataset roots
19
 
20
- Set dataset roots before launching. Dedicated variables are preferred when both
21
- datasets are available; `DATASET_PATH` remains a convenient fallback for a
22
- single downloaded dataset.
23
-
24
- ```bash
25
- # Bridge root: downloaded from nvidia/bridge_lerobot_v3 or an existing mirror.
26
- export BRIDGE_LEROBOT_ROOT=/path/to/bridge_lerobot_v3
27
-
28
- # LIBERO root: parent directory containing libero_10/, libero_object/,
29
- # libero_spatial/, and libero_goal/.
30
- export LIBERO_ROOT=/path/to/LIBERO_LeRobot_v3
31
- ```
32
-
33
- To download the public datasets instead:
34
-
35
- ```bash
36
- uvx hf@latest download --repo-type dataset nvidia/bridge_lerobot_v3 \
37
- --local-dir examples/data/bridge_lerobot_v3 --quiet
38
-
39
- uvx hf@latest download --repo-type dataset nvidia/LIBERO_LeRobot_v3 \
40
- --revision ddc1edeb6e51e2b7d4d2ba7a1433daaecd37aa64 \
41
- --local-dir examples/data/LIBERO_LeRobot_v3 --quiet
42
- ```
43
 
44
  ## Local launch
45
 
@@ -53,8 +39,6 @@ Then launch from the repository root:
53
 
54
  ```bash
55
  export PYTHONPATH=.
56
- export BRIDGE_LEROBOT_ROOT=/path/to/bridge_lerobot_v3
57
- export LIBERO_ROOT=/path/to/LIBERO_LeRobot_v3
58
 
59
  python cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py \
60
  --port 8020 \
 
4
  RGB frames, end-effector trajectories, and robot meshes/IK with
5
  [`viser`](https://viser.studio/), MuJoCo, and Pinocchio.
6
 
7
+ The standalone viewer registry is intentionally scoped to the packaged demo datasets:
8
 
9
+ - AV
10
+ - Fractal
11
+ - Bridge
12
+ - DROID
13
+ - UMI / FastUMI
14
+ - RoboMIND Franka
15
+ - RoboMIND Franka dual
16
 
17
  The dataset LazyConfig entries live in:
18
 
 
22
 
23
  ## Dataset roots
24
 
25
+ The Docker Space uses packaged examples under `/app/assets/examples`. For local
26
+ development, override individual roots only when needed, e.g. `BRIDGE_ROOT`,
27
+ `FRACTAL_ROOT`, `DROID_ROOT`, `UMI_ROOT`, `ROBOMIND_FRANKA_ROOT`,
28
+ `ROBOMIND_FRANKA_DUAL_ROOT`, or `AV_ROOT`.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  ## Local launch
31
 
 
39
 
40
  ```bash
41
  export PYTHONPATH=.
 
 
42
 
43
  python cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py \
44
  --port 8020 \
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/action_datasets.py CHANGED
@@ -49,7 +49,6 @@ BRIDGE_ROOT = _env_path("BRIDGE_LEROBOT_ROOT", "DATASET_PATH", default="/app/ass
49
  DATASET_BRIDGE_480 = L(dataset_entry)(
50
  name="bridge_20260501",
51
  dataset=L(BridgeOrigLeRobotDataset)(
52
- action_normalization="quantile_rot",
53
  chunk_length=16,
54
  enable_fast_init=True,
55
  fps=5.0,
@@ -72,7 +71,6 @@ DATASET_FRACTAL_256 = L(dataset_entry)(
72
  root=_env_path("FRACTAL_ROOT", default="/app/assets/examples/fractal20220817_data"),
73
  split="train",
74
  mode="joint",
75
- action_normalization="quantile_rot",
76
  enable_fast_init=False,
77
  ),
78
  ratio=1,
@@ -86,7 +84,6 @@ DATASET_DROID_480 = L(dataset_entry)(
86
  split="train",
87
  use_success_only=True,
88
  mode="joint",
89
- action_normalization="quantile_rot",
90
  enable_fast_init=False,
91
  ),
92
  ratio=1,
@@ -100,7 +97,6 @@ DATASET_ROBOMIND_FRANKA_480 = L(dataset_entry)(
100
  split="train",
101
  mode="joint",
102
  embodiment_type="robomind-franka",
103
- action_normalization="quantile_rot",
104
  enable_fast_init=False,
105
  ),
106
  ratio=1,
@@ -114,7 +110,6 @@ DATASET_ROBOMIND_FRANKA_DUAL_480 = L(dataset_entry)(
114
  split="train",
115
  mode="joint",
116
  embodiment_type="robomind-franka-dual",
117
- action_normalization="quantile_rot",
118
  enable_fast_init=False,
119
  ),
120
  ratio=1,
@@ -151,7 +146,6 @@ DATASET_UMI_256 = L(dataset_entry)(
151
  root=_env_path("UMI_ROOT", default="/app/assets/examples/fastumi/fastumi_single_arm/pour_coke"),
152
  split="train",
153
  mode="joint",
154
- action_normalization=None,
155
  enable_fast_init=False,
156
  ),
157
  ratio=1,
 
49
  DATASET_BRIDGE_480 = L(dataset_entry)(
50
  name="bridge_20260501",
51
  dataset=L(BridgeOrigLeRobotDataset)(
 
52
  chunk_length=16,
53
  enable_fast_init=True,
54
  fps=5.0,
 
71
  root=_env_path("FRACTAL_ROOT", default="/app/assets/examples/fractal20220817_data"),
72
  split="train",
73
  mode="joint",
 
74
  enable_fast_init=False,
75
  ),
76
  ratio=1,
 
84
  split="train",
85
  use_success_only=True,
86
  mode="joint",
 
87
  enable_fast_init=False,
88
  ),
89
  ratio=1,
 
97
  split="train",
98
  mode="joint",
99
  embodiment_type="robomind-franka",
 
100
  enable_fast_init=False,
101
  ),
102
  ratio=1,
 
110
  split="train",
111
  mode="joint",
112
  embodiment_type="robomind-franka-dual",
 
113
  enable_fast_init=False,
114
  ),
115
  ratio=1,
 
146
  root=_env_path("UMI_ROOT", default="/app/assets/examples/fastumi/fastumi_single_arm/pour_coke"),
147
  split="train",
148
  mode="joint",
 
149
  enable_fast_init=False,
150
  ),
151
  ratio=1,
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/ik_solver.py CHANGED
@@ -575,8 +575,8 @@ def compute_mujoco_geom_transforms(
575
  for i, mi in enumerate(pin_to_mj_map):
576
  data.qpos[mi] = q[i]
577
  else:
578
- # For robots with a separate gripper ctrl signal (e.g. UR5e),
579
- # only write arm joints — the 7th column is a raw gripper value,
580
  # not a qpos DOF. For other robots the pinocchio output already
581
  # includes finger joints in the trailing columns; write them all.
582
  n_set = n_arm if has_gripper_ctrl else len(q)
 
575
  for i, mi in enumerate(pin_to_mj_map):
576
  data.qpos[mi] = q[i]
577
  else:
578
+ # For robots with a separate gripper ctrl signal, only write arm
579
+ # joints — the extra column is a raw gripper value,
580
  # not a qpos DOF. For other robots the pinocchio output already
581
  # includes finger joints in the trailing columns; write them all.
582
  n_set = n_arm if has_gripper_ctrl else len(q)
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/unified_action.py CHANGED
@@ -1,33 +1,23 @@
1
  # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
  # SPDX-License-Identifier: OpenMDW-1.1
3
 
4
- """Canonical 57D action representation with explicit input formats.
5
 
6
- 57D layout::
7
-
8
- [ego(9) | R_wrist(9) | R_fingers(15) | L_wrist(9) | L_fingers(15)]
9
-
10
- Each 9D SE(3) slot is ``[pos(3) + rot6d(6)]``.
11
- Each finger slot is 3D (position in wrist-local frame), 5 fingers × 3D = 15D.
12
-
13
- Any supported action format is converted to ``UnifiedAction(action_57d, mask)``
14
- before the viewer processes it. The mask explicitly declares which slots are valid.
15
  """
16
 
17
  from __future__ import annotations
18
 
19
- from dataclasses import dataclass, field
20
  from enum import Enum
21
  from typing import Any
22
 
23
  import numpy as np
24
  import torch
25
 
26
- from cosmos_framework.utils import log
27
- from cosmos_framework.data.vfm.action.pose_utils import convert_rotation
28
-
29
- FINGER_NAMES = ("thumb", "index", "middle", "ring", "pinky")
30
- ALL_FINGERS = (True, True, True, True, True)
31
  NO_FINGERS = (False, False, False, False, False)
32
 
33
 
@@ -37,30 +27,15 @@ class ActionFormat(str, Enum):
37
  EGO_9D = "9d"
38
  SINGLE_ARM_10D = "10d"
39
  DUAL_ARM_20D = "20d"
40
- UNIFIED_57D = "57d"
41
 
42
  @property
43
  def expected_dim(self) -> int:
44
- """Return the exact trailing dimension required by this format."""
45
- return {
46
- ActionFormat.EGO_9D: 9,
47
- ActionFormat.SINGLE_ARM_10D: 10,
48
- ActionFormat.DUAL_ARM_20D: 20,
49
- ActionFormat.UNIFIED_57D: 57,
50
- }[self]
51
-
52
-
53
- # ─── Data Structures ─────────────────────────────────────────────────────────
54
 
55
 
56
  @dataclass
57
  class Action57DMask:
58
- """Per-component validity for the 57D layout.
59
-
60
- ``right_fingers`` / ``left_fingers`` are tuples of 5 bools
61
- (thumb, index, middle, ring, pinky) — supports any combination from
62
- 2-finger grippers to full 5-finger hands.
63
- """
64
 
65
  ego: bool = False
66
  right_wrist: bool = False
@@ -71,71 +46,41 @@ class Action57DMask:
71
 
72
  @dataclass
73
  class UnifiedAction:
74
- """Canonical 57D action for the viewer pipeline.
75
-
76
- ``action`` is always shape ``(T, 57)`` with invalid slots zero-padded.
77
- ``gripper_right`` / ``gripper_left`` carry auxiliary scalar gripper data
78
- for embodiments that don't map to finger positions (V-shape visualisation).
79
- """
80
 
81
- action: np.ndarray # (T, 57)
82
  mask: Action57DMask
83
- gripper_right: np.ndarray | None = None # (T,) scalar 0-1
84
- gripper_left: np.ndarray | None = None # (T,) scalar 0-1
85
 
86
 
87
  @dataclass
88
  class SceneState:
89
- """Render-ready world-space geometry reconstructed from ``UnifiedAction``.
90
 
91
- Contract:
92
- - all SE(3) trajectories live in one shared ``scene_world`` frame
93
- - fingertip positions are world-space if present
94
- - gripper signals are scalar open/close values sampled at ``T+1`` frames
95
- """
96
-
97
- mask: Action57DMask = field(default_factory=Action57DMask)
98
- # Absolute SE(3) trajectories — (T+1, 4, 4)
99
  ego_poses: np.ndarray | None = None
100
  right_poses: np.ndarray | None = None
101
  left_poses: np.ndarray | None = None
102
- # World-space fingertip positions — (T+1, 5, 3)
103
  right_fingers: np.ndarray | None = None
104
  left_fingers: np.ndarray | None = None
105
- # Scalar gripper — (T+1,)
106
  gripper_right: np.ndarray | None = None
107
  gripper_left: np.ndarray | None = None
108
- # Metadata
109
- video: np.ndarray | None = None # (T+1, H, W, 3) uint8
110
- action_raw: np.ndarray | None = None # canonical 57D action tensor for display
111
  T: int = 0
112
- # FK mesh animation: raw (T, nq) joint configs populated by datasets that
113
- # perform EE conversion internally (e.g. robomind-ur). When set, the renderer
114
- # uses these for FK mesh animation instead of running IK on right_poses.
115
- joint_configs: np.ndarray | None = None
116
-
117
 
118
- # ─── Converters ───────────────────────────────────────────────────────────────
119
 
120
-
121
- def to_unified_from_57d(action: np.ndarray) -> UnifiedAction:
122
- """57D hand_pose passthrough, all 5 slots valid."""
123
- return UnifiedAction(
124
- action=action.astype(np.float32),
125
- mask=Action57DMask(
126
- ego=True,
127
- right_wrist=True,
128
- right_fingers=ALL_FINGERS,
129
- left_wrist=True,
130
- left_fingers=ALL_FINGERS,
131
- ),
132
- )
133
 
134
 
135
  def to_unified_from_10d(action: np.ndarray) -> UnifiedAction:
136
  """10D single arm ``[pos(3)+rot6d(6)+grip(1)]`` → right wrist + gripper."""
137
- T = action.shape[0]
138
- a = np.zeros((T, 57), dtype=np.float32) # [T,57]
139
  a[:, 9:18] = action[:, :9]
140
  return UnifiedAction(
141
  action=a,
@@ -145,75 +90,50 @@ def to_unified_from_10d(action: np.ndarray) -> UnifiedAction:
145
 
146
 
147
  def to_unified_from_20d(action: np.ndarray) -> UnifiedAction:
148
- """20D dual arm ``[left(10) | right(10)]`` → both wrists + both grippers.
149
-
150
- Data layout: ``[L_pos(3) + L_rot6d(6) + L_grip(1) | R_pos(3) + R_rot6d(6) + R_grip(1)]``.
151
- Maps left arm → left wrist slot [33:42], right arm → right wrist slot [9:18].
152
- """
153
- T = action.shape[0]
154
- a = np.zeros((T, 57), dtype=np.float32) # [T,57]
155
- a[:, 33:42] = action[:, :9] # left arm → left wrist slot [33:42]
156
- a[:, 9:18] = action[:, 10:19] # right arm → right wrist slot [9:18]
157
  return UnifiedAction(
158
  action=a,
159
  mask=Action57DMask(right_wrist=True, left_wrist=True),
160
- gripper_right=action[:, 19].astype(np.float32), # right arm gripper
161
- gripper_left=action[:, 9].astype(np.float32), # left arm gripper
162
- )
163
-
164
-
165
- def to_unified_from_9d(action: np.ndarray) -> UnifiedAction:
166
- """9D camera/AV ``[pos(3)+rot6d(6)]`` → ego only."""
167
- T = action.shape[0]
168
- a = np.zeros((T, 57), dtype=np.float32) # [T,57]
169
- a[:, 0:9] = action[:, :9]
170
- return UnifiedAction(
171
- action=a,
172
- mask=Action57DMask(ego=True),
173
  )
174
 
175
 
176
  def _validate_action_shape(action: np.ndarray, action_format: ActionFormat) -> None:
177
- """Raise when a raw action tensor does not match its declared format."""
178
  if action.ndim != 2:
179
  raise ValueError(f"Expected a rank-2 action array, got shape {action.shape}")
180
- actual_dim = int(action.shape[-1])
181
  expected_dim = action_format.expected_dim
 
182
  if actual_dim != expected_dim:
183
  raise ValueError(f"Action format {action_format.value} expects trailing dim {expected_dim}, got {actual_dim}")
184
 
185
 
186
  def to_unified(action: np.ndarray, action_format: ActionFormat) -> UnifiedAction:
187
- """Convert one explicit raw action format into ``UnifiedAction``."""
188
  _validate_action_shape(action, action_format)
189
- if action_format is ActionFormat.UNIFIED_57D:
190
- return to_unified_from_57d(action)
191
- if action_format is ActionFormat.DUAL_ARM_20D:
192
- return to_unified_from_20d(action)
193
  if action_format is ActionFormat.EGO_9D:
194
  return to_unified_from_9d(action)
195
  if action_format is ActionFormat.SINGLE_ARM_10D:
196
  return to_unified_from_10d(action)
 
 
197
  raise ValueError(f"Unsupported action format: {action_format}")
198
 
199
 
200
  def _pos_rot6d_to_mat(se3: np.ndarray) -> np.ndarray:
201
  """Convert ``(N, 9)`` pos+rot6d to ``(N, 4, 4)`` SE(3) matrices."""
202
- N = se3.shape[0]
203
  pos = se3[:, :3]
204
  r6 = se3[:, 3:9]
205
-
206
  col0 = r6[:, :3].copy()
207
- col0_norm = np.linalg.norm(col0, axis=-1, keepdims=True) + 1e-8
208
- col0 = col0 / col0_norm
209
-
210
  col1 = r6[:, 3:6] - np.sum(r6[:, 3:6] * col0, axis=-1, keepdims=True) * col0
211
- col1_norm = np.linalg.norm(col1, axis=-1, keepdims=True) + 1e-8
212
- col1 = col1 / col1_norm
213
-
214
  col2 = np.cross(col0, col1)
215
-
216
- mats = np.tile(np.eye(4, dtype=np.float32), (N, 1, 1))
217
  mats[:, :3, 0] = col0
218
  mats[:, :3, 1] = col1
219
  mats[:, :3, 2] = col2
@@ -226,255 +146,77 @@ def _chain_se3(
226
  initial_pose: np.ndarray | None = None,
227
  pose_convention: str = "backward_framewise",
228
  ) -> np.ndarray:
229
- """Chain ``(T, 9)`` relative deltas into ``(T+1, 4, 4)`` absolute poses.
230
-
231
- For ``backward_framewise``: ``P_{t+1} = P_t @ delta_t``.
232
- For ``absolute``: each row is already an absolute pose (no chaining).
233
- """
234
- T = deltas.shape[0]
235
  delta_mats = _pos_rot6d_to_mat(deltas)
236
-
237
- if initial_pose is None:
238
- initial_pose = np.eye(4, dtype=np.float32)
239
- else:
240
- initial_pose = initial_pose.astype(np.float32)
241
-
242
- poses = np.empty((T + 1, 4, 4), dtype=np.float32)
243
- poses[0] = initial_pose
244
-
245
  if pose_convention == "absolute":
246
  poses[1:] = delta_mats
247
  else:
248
- for t in range(T):
249
  poses[t + 1] = poses[t] @ delta_mats[t]
250
-
251
  return poses
252
 
253
 
254
- def _extract_fingers(raw: np.ndarray) -> np.ndarray:
255
- """``(T, 15)`` ``(T+1, 5, 3)`` with first frame duplicated."""
256
- T = raw.shape[0]
257
- fingers = raw.reshape(T, 5, 3).astype(np.float32) # [T,5,3]
258
- return np.concatenate([fingers[:1], fingers], axis=0)
259
-
260
-
261
- def _to_numpy_float32(value: object) -> np.ndarray:
262
- """Convert a tensor-like value to a float32 NumPy array."""
263
-
264
- if isinstance(value, torch.Tensor):
265
- return value.detach().cpu().numpy().astype(np.float32)
266
- return np.asarray(value, dtype=np.float32)
267
-
268
-
269
- def _quat_xyzw_to_rotmat(q: np.ndarray) -> np.ndarray:
270
- """Convert ``(N, 4)`` xyzw quaternions to ``(N, 3, 3)`` rotation matrices."""
271
- x, y, z, w = q[:, 0], q[:, 1], q[:, 2], q[:, 3]
272
- R = np.zeros((len(q), 3, 3), dtype=np.float32)
273
- R[:, 0, 0] = 1 - 2 * (y * y + z * z)
274
- R[:, 0, 1] = 2 * (x * y - z * w)
275
- R[:, 0, 2] = 2 * (x * z + y * w)
276
- R[:, 1, 0] = 2 * (x * y + z * w)
277
- R[:, 1, 1] = 1 - 2 * (x * x + z * z)
278
- R[:, 1, 2] = 2 * (y * z - x * w)
279
- R[:, 2, 0] = 2 * (x * z - y * w)
280
- R[:, 2, 1] = 2 * (y * z + x * w)
281
- R[:, 2, 2] = 1 - 2 * (x * x + y * y)
282
- return R
283
-
284
-
285
- def _build_absolute_from_overlay(sample: dict) -> dict[str, np.ndarray] | None:
286
- """Build absolute world-frame poses from HandPoseDataset overlay data.
287
-
288
- Returns None if overlay keys are missing.
289
- """
290
- raw_cam_pos = sample.get("raw_cam_position")
291
- if raw_cam_pos is None:
292
- return None
293
-
294
- cam_pos = _to_numpy_float32(raw_cam_pos) # [T+1,3]
295
- cam_rot_q = _to_numpy_float32(sample["raw_cam_rotation"]) # [T+1,4]
296
- right_3d = _to_numpy_float32(sample["raw_cam_right_3d"]) # [T+1,63]
297
- left_3d = _to_numpy_float32(sample["raw_cam_left_3d"]) # [T+1,63]
298
- right_rot = _to_numpy_float32(sample["raw_cam_right_rot"]) # [T+1,84]
299
- left_rot = _to_numpy_float32(sample["raw_cam_left_rot"]) # [T+1,84]
300
-
301
- T1 = cam_pos.shape[0]
302
- FTIP = [4, 8, 12, 16, 20]
303
-
304
- # Camera c2w (world frame)
305
- cam_c2w = np.tile(np.eye(4, dtype=np.float32), (T1, 1, 1)) # [T+1,4,4]
306
- cam_c2w[:, :3, 3] = cam_pos
307
- cam_c2w[:, :3, :3] = _quat_xyzw_to_rotmat(cam_rot_q)
308
-
309
- def _wrist_world(pos_63, rot_84):
310
- wrist_pos = pos_63[:, :3]
311
- wrist_q = rot_84.reshape(T1, 21, 4)[:, 0]
312
- wrist_cam = np.tile(np.eye(4, dtype=np.float32), (T1, 1, 1)) # [T+1,4,4]
313
- wrist_cam[:, :3, 3] = wrist_pos
314
- wrist_cam[:, :3, :3] = _quat_xyzw_to_rotmat(wrist_q)
315
- return cam_c2w @ wrist_cam
316
-
317
- def _fingers_world(pos_63):
318
- joints = pos_63.reshape(T1, 21, 3)[:, FTIP]
319
- R = cam_c2w[:, :3, :3]
320
- t = cam_c2w[:, :3, 3]
321
- return np.einsum("tij,tfj->tfi", R, joints) + t[:, None, :] # [T+1,5,3]
322
-
323
- return {
324
- "ego_poses": cam_c2w,
325
- "right_wrist_poses": _wrist_world(right_3d, right_rot),
326
- "left_wrist_poses": _wrist_world(left_3d, left_rot),
327
- "right_fingers": _fingers_world(right_3d),
328
- "left_fingers": _fingers_world(left_3d),
329
- }
330
-
331
-
332
- def _build_libero_absolute_from_state(sample: dict) -> dict[str, np.ndarray] | None:
333
- """Build absolute LIBERO right-wrist poses from ``observation.state``.
334
-
335
- LIBERO policy actions are normalized robosuite controller commands rather
336
- than meter-scale SE(3) deltas. For visualization, the raw state sequence is
337
- the correct source of metric end-effector poses.
338
- """
339
- if sample.get("source_repo_id") is None or sample.get("state") is None:
340
- return None
341
- if "libero" not in str(sample.get("source_repo_id", "")).lower():
342
- return None
343
-
344
- from cosmos_framework.data.vfm.action.libero_pose_utils import build_libero_abs_pose
345
-
346
- to_opencv = str(sample.get("pose_coordinate_frame", "native")) == "opencv"
347
- right_poses = build_libero_abs_pose(sample["state"], to_opencv=to_opencv)
348
- return {"right_wrist_poses": right_poses.astype(np.float32, copy=False)}
349
-
350
-
351
- # ─── Scene State Builder ─────────────────────────────────────────────────────
352
 
353
 
354
  def build_scene_state(
355
  unified: UnifiedAction,
 
356
  initial_pose: np.ndarray | None = None,
357
  initial_pose_right: np.ndarray | None = None,
358
  initial_pose_left: np.ndarray | None = None,
359
  right_base_pose: np.ndarray | None = None,
360
  left_base_pose: np.ndarray | None = None,
361
  pose_convention: str = "backward_framewise",
362
- sample: dict | None = None,
363
  ) -> SceneState:
364
- """Reconstruct a canonical world-space ``SceneState`` from ``UnifiedAction``.
365
-
366
- Chains SE(3) deltas for valid mask slots. If ``sample`` contains overlay
367
- data (HandPoseDataset raw camera/joint fields), overrides with absolute
368
- world-frame poses.
369
-
370
- Args:
371
- unified: Canonical 57D action with mask.
372
- initial_pose: Default initial pose for all slots.
373
- initial_pose_right: Override for right wrist (dual arm).
374
- initial_pose_left: Override for left wrist (dual arm).
375
- right_base_pose: Right-arm base pose that maps arm-local trajectories into ``scene_world``.
376
- left_base_pose: Left-arm base pose that maps arm-local trajectories into ``scene_world``.
377
- pose_convention: Pose convention for SE(3) chaining.
378
- sample: Raw dataset sample (for overlay data).
379
- """
380
-
381
- def _apply_pose_base(poses: np.ndarray | None, base_pose: np.ndarray | None) -> np.ndarray | None:
382
- if poses is None or base_pose is None:
383
- return poses
384
- return np.einsum("ij,njk->nik", base_pose, poses).astype(np.float32) # [T+1,4,4]
385
-
386
- def _fingers_local_to_world(
387
- fingers_local: np.ndarray | None,
388
- wrist_poses_world: np.ndarray | None,
389
- ) -> np.ndarray | None:
390
- if fingers_local is None:
391
- return None
392
- if wrist_poses_world is None:
393
- raise ValueError("Finger trajectories require matching wrist poses to build world-space SceneState")
394
- wrist_rot = wrist_poses_world[:, :3, :3].astype(np.float32) # [T+1,3,3]
395
- wrist_pos = wrist_poses_world[:, :3, 3].astype(np.float32) # [T+1,3]
396
- return np.einsum("tij,tfj->tfi", wrist_rot, fingers_local) + wrist_pos[:, None, :] # [T+1,5,3]
397
-
398
- mask = unified.mask
399
  action = unified.action
400
- state = SceneState(mask=mask)
401
-
402
- ip_default = initial_pose if initial_pose is not None else np.eye(4, dtype=np.float32)
403
- ip_right = initial_pose_right if initial_pose_right is not None else ip_default
404
- ip_left = initial_pose_left if initial_pose_left is not None else ip_default
405
-
406
  if mask.ego:
407
- state.ego_poses = _chain_se3(action[:, 0:9], ip_default, pose_convention)
408
  if mask.right_wrist:
409
- state.right_poses = _chain_se3(action[:, 9:18], ip_right, pose_convention)
410
- if any(mask.right_fingers):
411
- state.right_fingers = _extract_fingers(action[:, 18:33])
 
 
412
  if mask.left_wrist:
413
- state.left_poses = _chain_se3(action[:, 33:42], ip_left, pose_convention)
414
- if any(mask.left_fingers):
415
- state.left_fingers = _extract_fingers(action[:, 42:57])
416
-
417
- if unified.gripper_right is not None:
418
- g = unified.gripper_right
419
- state.gripper_right = np.concatenate([[g[0]], g]).astype(np.float32, copy=False) # [T+1]
420
- if unified.gripper_left is not None:
421
- g = unified.gripper_left
422
- state.gripper_left = np.concatenate([[g[0]], g]).astype(np.float32, copy=False) # [T+1]
423
-
424
- abs_data = _build_absolute_from_overlay(sample) if sample is not None else None
425
- if abs_data is not None:
426
- state.ego_poses = abs_data["ego_poses"]
427
- state.right_poses = abs_data["right_wrist_poses"]
428
- state.left_poses = abs_data["left_wrist_poses"]
429
- state.right_fingers = abs_data["right_fingers"]
430
- state.left_fingers = abs_data["left_fingers"]
431
- log.info(
432
- f"Overlay absolute mode | ego range: "
433
- f"[{abs_data['ego_poses'][:, :3, 3].min():.3f}, "
434
- f"{abs_data['ego_poses'][:, :3, 3].max():.3f}] | "
435
- f"R wrist[0]: {abs_data['right_wrist_poses'][0, :3, 3]}"
436
  )
437
- else:
438
- state.right_poses = _apply_pose_base(state.right_poses, right_base_pose)
439
- state.left_poses = _apply_pose_base(state.left_poses, left_base_pose)
440
-
441
- libero_abs_data = _build_libero_absolute_from_state(sample) if sample is not None else None
442
- if libero_abs_data is not None:
443
- state.right_poses = libero_abs_data["right_wrist_poses"]
444
-
445
- state.right_fingers = _fingers_local_to_world(state.right_fingers, state.right_poses)
446
- state.left_fingers = _fingers_local_to_world(state.left_fingers, state.left_poses)
447
-
448
- state.action_raw = unified.action.astype(np.float32)
449
  state.T = action.shape[0]
450
  return state
451
 
452
 
453
- # ─── Video Extraction ─────────────────────────────────────────────────────────
454
-
455
-
456
  def get_video_from_sample(sample: dict) -> np.ndarray | None:
457
- """Extract video frames from a dataset sample.
458
-
459
- Returns ``(T+1, H, W, 3)`` uint8 array, or None.
460
- """
461
  video = sample.get("video")
462
  if video is None:
463
  return None
464
  if isinstance(video, torch.Tensor):
465
  video = video.numpy()
466
-
467
  if video.ndim == 4:
468
- C, T_dim, H, W = video.shape
469
- if C in (1, 3) and T_dim > 3:
470
  video = np.transpose(video, (1, 2, 3, 0))
471
- elif video.shape[1] in (1, 3) and T_dim <= 3:
472
  video = np.transpose(video, (0, 2, 3, 1))
473
-
474
  if video.dtype in (np.float32, np.float64):
475
  video = np.clip(video * 255, 0, 255).astype(np.uint8)
476
-
477
  if video.ndim == 4 and video.shape[-1] == 1:
478
  video = np.repeat(video, 3, axis=-1)
479
-
480
  return video
 
1
  # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
  # SPDX-License-Identifier: OpenMDW-1.1
3
 
4
+ """Raw action converters for the standalone action viewer.
5
 
6
+ The Space release is raw-action-only. It accepts 9D ego/camera actions, 10D
7
+ single-arm actions, and 20D dual-arm actions. The renderer still uses a fixed
8
+ 57D internal canvas so the downstream drawing code can stay format-agnostic,
9
+ but hand-pose/57D datasets are intentionally not part of this release.
 
 
 
 
 
10
  """
11
 
12
  from __future__ import annotations
13
 
14
+ from dataclasses import dataclass
15
  from enum import Enum
16
  from typing import Any
17
 
18
  import numpy as np
19
  import torch
20
 
 
 
 
 
 
21
  NO_FINGERS = (False, False, False, False, False)
22
 
23
 
 
27
  EGO_9D = "9d"
28
  SINGLE_ARM_10D = "10d"
29
  DUAL_ARM_20D = "20d"
 
30
 
31
  @property
32
  def expected_dim(self) -> int:
33
+ return {self.EGO_9D: 9, self.SINGLE_ARM_10D: 10, self.DUAL_ARM_20D: 20}[self]
 
 
 
 
 
 
 
 
 
34
 
35
 
36
  @dataclass
37
  class Action57DMask:
38
+ """Per-component validity for the internal 57D rendering canvas."""
 
 
 
 
 
39
 
40
  ego: bool = False
41
  right_wrist: bool = False
 
46
 
47
  @dataclass
48
  class UnifiedAction:
49
+ """Internal 57D action plus scalar gripper side channels."""
 
 
 
 
 
50
 
51
+ action: np.ndarray
52
  mask: Action57DMask
53
+ gripper_right: np.ndarray | None = None
54
+ gripper_left: np.ndarray | None = None
55
 
56
 
57
  @dataclass
58
  class SceneState:
59
+ """Renderer-ready reconstructed trajectories for one sample."""
60
 
61
+ mask: Action57DMask
 
 
 
 
 
 
 
62
  ego_poses: np.ndarray | None = None
63
  right_poses: np.ndarray | None = None
64
  left_poses: np.ndarray | None = None
 
65
  right_fingers: np.ndarray | None = None
66
  left_fingers: np.ndarray | None = None
 
67
  gripper_right: np.ndarray | None = None
68
  gripper_left: np.ndarray | None = None
69
+ video: np.ndarray | None = None
70
+ action_raw: np.ndarray | None = None
 
71
  T: int = 0
 
 
 
 
 
72
 
 
73
 
74
+ def to_unified_from_9d(action: np.ndarray) -> UnifiedAction:
75
+ """9D camera/AV ``[pos(3)+rot6d(6)]`` ego only."""
76
+ a = np.zeros((action.shape[0], 57), dtype=np.float32)
77
+ a[:, 0:9] = action[:, :9]
78
+ return UnifiedAction(action=a, mask=Action57DMask(ego=True))
 
 
 
 
 
 
 
 
79
 
80
 
81
  def to_unified_from_10d(action: np.ndarray) -> UnifiedAction:
82
  """10D single arm ``[pos(3)+rot6d(6)+grip(1)]`` → right wrist + gripper."""
83
+ a = np.zeros((action.shape[0], 57), dtype=np.float32)
 
84
  a[:, 9:18] = action[:, :9]
85
  return UnifiedAction(
86
  action=a,
 
90
 
91
 
92
  def to_unified_from_20d(action: np.ndarray) -> UnifiedAction:
93
+ """20D dual arm ``[left(10) | right(10)]`` → both wrists + both grippers."""
94
+ a = np.zeros((action.shape[0], 57), dtype=np.float32)
95
+ a[:, 33:42] = action[:, :9]
96
+ a[:, 9:18] = action[:, 10:19]
 
 
 
 
 
97
  return UnifiedAction(
98
  action=a,
99
  mask=Action57DMask(right_wrist=True, left_wrist=True),
100
+ gripper_right=action[:, 19].astype(np.float32),
101
+ gripper_left=action[:, 9].astype(np.float32),
 
 
 
 
 
 
 
 
 
 
 
102
  )
103
 
104
 
105
  def _validate_action_shape(action: np.ndarray, action_format: ActionFormat) -> None:
 
106
  if action.ndim != 2:
107
  raise ValueError(f"Expected a rank-2 action array, got shape {action.shape}")
 
108
  expected_dim = action_format.expected_dim
109
+ actual_dim = int(action.shape[-1])
110
  if actual_dim != expected_dim:
111
  raise ValueError(f"Action format {action_format.value} expects trailing dim {expected_dim}, got {actual_dim}")
112
 
113
 
114
  def to_unified(action: np.ndarray, action_format: ActionFormat) -> UnifiedAction:
115
+ """Convert one supported raw action format into the internal render canvas."""
116
  _validate_action_shape(action, action_format)
 
 
 
 
117
  if action_format is ActionFormat.EGO_9D:
118
  return to_unified_from_9d(action)
119
  if action_format is ActionFormat.SINGLE_ARM_10D:
120
  return to_unified_from_10d(action)
121
+ if action_format is ActionFormat.DUAL_ARM_20D:
122
+ return to_unified_from_20d(action)
123
  raise ValueError(f"Unsupported action format: {action_format}")
124
 
125
 
126
  def _pos_rot6d_to_mat(se3: np.ndarray) -> np.ndarray:
127
  """Convert ``(N, 9)`` pos+rot6d to ``(N, 4, 4)`` SE(3) matrices."""
128
+ n = se3.shape[0]
129
  pos = se3[:, :3]
130
  r6 = se3[:, 3:9]
 
131
  col0 = r6[:, :3].copy()
132
+ col0 = col0 / (np.linalg.norm(col0, axis=-1, keepdims=True) + 1e-8)
 
 
133
  col1 = r6[:, 3:6] - np.sum(r6[:, 3:6] * col0, axis=-1, keepdims=True) * col0
134
+ col1 = col1 / (np.linalg.norm(col1, axis=-1, keepdims=True) + 1e-8)
 
 
135
  col2 = np.cross(col0, col1)
136
+ mats = np.tile(np.eye(4, dtype=np.float32), (n, 1, 1))
 
137
  mats[:, :3, 0] = col0
138
  mats[:, :3, 1] = col1
139
  mats[:, :3, 2] = col2
 
146
  initial_pose: np.ndarray | None = None,
147
  pose_convention: str = "backward_framewise",
148
  ) -> np.ndarray:
149
+ """Chain ``(T, 9)`` relative deltas into ``(T+1, 4, 4)`` absolute poses."""
 
 
 
 
 
150
  delta_mats = _pos_rot6d_to_mat(deltas)
151
+ initial = np.eye(4, dtype=np.float32) if initial_pose is None else initial_pose.astype(np.float32)
152
+ poses = np.empty((deltas.shape[0] + 1, 4, 4), dtype=np.float32)
153
+ poses[0] = initial
 
 
 
 
 
 
154
  if pose_convention == "absolute":
155
  poses[1:] = delta_mats
156
  else:
157
+ for t in range(deltas.shape[0]):
158
  poses[t + 1] = poses[t] @ delta_mats[t]
 
159
  return poses
160
 
161
 
162
+ def _apply_pose_base(poses: np.ndarray | None, base_pose: np.ndarray | None) -> np.ndarray | None:
163
+ if poses is None or base_pose is None:
164
+ return poses
165
+ return np.einsum("ij,tjk->tik", base_pose.astype(np.float32), poses).astype(np.float32)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
 
168
  def build_scene_state(
169
  unified: UnifiedAction,
170
+ *,
171
  initial_pose: np.ndarray | None = None,
172
  initial_pose_right: np.ndarray | None = None,
173
  initial_pose_left: np.ndarray | None = None,
174
  right_base_pose: np.ndarray | None = None,
175
  left_base_pose: np.ndarray | None = None,
176
  pose_convention: str = "backward_framewise",
177
+ sample: dict[str, Any] | None = None,
178
  ) -> SceneState:
179
+ """Reconstruct viewer trajectories from raw-action-derived ``UnifiedAction``."""
180
+ del sample
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  action = unified.action
182
+ mask = unified.mask
183
+ state = SceneState(mask=mask, gripper_right=unified.gripper_right, gripper_left=unified.gripper_left)
 
 
 
 
184
  if mask.ego:
185
+ state.ego_poses = _chain_se3(action[:, 0:9], initial_pose, pose_convention)
186
  if mask.right_wrist:
187
+ state.right_poses = _chain_se3(
188
+ action[:, 9:18],
189
+ initial_pose_right if initial_pose_right is not None else initial_pose,
190
+ pose_convention,
191
+ )
192
  if mask.left_wrist:
193
+ state.left_poses = _chain_se3(
194
+ action[:, 33:42],
195
+ initial_pose_left if initial_pose_left is not None else initial_pose,
196
+ pose_convention,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  )
198
+ state.right_poses = _apply_pose_base(state.right_poses, right_base_pose)
199
+ state.left_poses = _apply_pose_base(state.left_poses, left_base_pose)
200
+ state.action_raw = action.astype(np.float32)
 
 
 
 
 
 
 
 
 
201
  state.T = action.shape[0]
202
  return state
203
 
204
 
 
 
 
205
  def get_video_from_sample(sample: dict) -> np.ndarray | None:
206
+ """Extract video frames from a dataset sample as ``(T+1, H, W, 3)`` uint8."""
 
 
 
207
  video = sample.get("video")
208
  if video is None:
209
  return None
210
  if isinstance(video, torch.Tensor):
211
  video = video.numpy()
 
212
  if video.ndim == 4:
213
+ c, t_dim, _, _ = video.shape
214
+ if c in (1, 3) and t_dim > 3:
215
  video = np.transpose(video, (1, 2, 3, 0))
216
+ elif video.shape[1] in (1, 3) and t_dim <= 3:
217
  video = np.transpose(video, (0, 2, 3, 1))
 
218
  if video.dtype in (np.float32, np.float64):
219
  video = np.clip(video * 255, 0, 255).astype(np.uint8)
 
220
  if video.ndim == 4 and video.shape[-1] == 1:
221
  video = np.repeat(video, 3, axis=-1)
 
222
  return video
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/unified_renderer.py CHANGED
@@ -15,7 +15,7 @@ import numpy as np
15
 
16
  from cosmos_framework.utils import log
17
  from cosmos_framework.data.vfm.action.urdf_visualizer.robot_scene_model import RobotSceneModel
18
- from cosmos_framework.data.vfm.action.urdf_visualizer.unified_action import FINGER_NAMES, SceneState
19
 
20
 
21
  class UnifiedRenderer:
@@ -41,13 +41,7 @@ class UnifiedRenderer:
41
  COLOR_EGO_TOP = (231, 76, 60) # red
42
  COLOR_RIGHT = (243, 156, 18) # orange
43
  COLOR_LEFT = (155, 89, 182) # purple
44
- FINGER_COLORS = [
45
- (231, 76, 60),
46
- (241, 196, 15),
47
- (46, 204, 113),
48
- (52, 152, 219),
49
- (155, 89, 182),
50
- ]
51
 
52
  @staticmethod
53
  def _soften_color(color: tuple[int, int, int], mix: float = 0.35) -> tuple[int, int, int]:
@@ -135,23 +129,14 @@ class UnifiedRenderer:
135
  point_size=0.015,
136
  point_shape="circle",
137
  )
138
- self.right_fingers = [
139
- server.scene.add_icosphere(
140
- self._p(f"/right/finger_{FINGER_NAMES[i]}"),
141
- radius=self.TIP_RADIUS,
142
- color=self.FINGER_COLORS[i],
143
- position=(0.0, 0.0, 0.0),
144
- )
145
- for i in range(5)
146
- ]
147
  self.right_gripper_tips = [
148
  server.scene.add_icosphere(
149
  self._p(f"/right/gripper_tip_{side}"),
150
  radius=self.TIP_RADIUS,
151
- color=self.FINGER_COLORS[i],
152
  position=(0.0, 0.0, 0.0),
153
  )
154
- for i, side in enumerate(("thumb", "index"))
155
  ]
156
 
157
  # ── Left effector ──
@@ -171,23 +156,14 @@ class UnifiedRenderer:
171
  point_size=0.015,
172
  point_shape="circle",
173
  )
174
- self.left_fingers = [
175
- server.scene.add_icosphere(
176
- self._p(f"/left/finger_{FINGER_NAMES[i]}"),
177
- radius=self.TIP_RADIUS,
178
- color=self.FINGER_COLORS[i],
179
- position=(0.0, 0.0, 0.0),
180
- )
181
- for i in range(5)
182
- ]
183
  self.left_gripper_tips = [
184
  server.scene.add_icosphere(
185
  self._p(f"/left/gripper_tip_{side}"),
186
  radius=self.TIP_RADIUS,
187
- color=self.FINGER_COLORS[i],
188
  position=(0.0, 0.0, 0.0),
189
  )
190
- for i, side in enumerate(("thumb", "index"))
191
  ]
192
 
193
  # ── IK robot meshes ──
@@ -288,13 +264,6 @@ class UnifiedRenderer:
288
  self.right_traj,
289
  show,
290
  )
291
- self._update_fingers(
292
- t,
293
- state.right_fingers,
294
- mask.right_fingers,
295
- self.right_fingers,
296
- show,
297
- )
298
  self._update_gripper(
299
  t,
300
  state.right_poses,
@@ -315,13 +284,6 @@ class UnifiedRenderer:
315
  self.left_traj,
316
  show,
317
  )
318
- self._update_fingers(
319
- t,
320
- state.left_fingers,
321
- mask.left_fingers,
322
- self.left_fingers,
323
- show,
324
- )
325
  self._update_gripper(
326
  t,
327
  state.left_poses,
@@ -364,9 +326,6 @@ class UnifiedRenderer:
364
  def _v(active):
365
  return "✓" if active else "·"
366
 
367
- gr = a[18:33].reshape(5, 3)
368
- gl = a[42:57].reshape(5, 3)
369
-
370
  # Gripper auxiliary values (not in 57D vector)
371
  grip_r_str = ""
372
  grip_l_str = ""
@@ -383,10 +342,7 @@ class UnifiedRenderer:
383
  "",
384
  f"{_v(mask.right_wrist)} R wrist pos [9:12] {_fmt(a[9:12])}",
385
  f" {' ' * 1} rot [12:18] {_fmt(a[12:18])}",
386
- f" R fingers [18:33]",
387
  ]
388
- for i, name in enumerate(FINGER_NAMES):
389
- parts.append(f" {_v(mask.right_fingers[i])} {name:7s} {_fmt(gr[i])}")
390
  if grip_r_str:
391
  parts.append(grip_r_str)
392
 
@@ -394,10 +350,7 @@ class UnifiedRenderer:
394
  "",
395
  f"{_v(mask.left_wrist)} L wrist pos [33:36] {_fmt(a[33:36])}",
396
  f" {' ' * 1} rot [36:42] {_fmt(a[36:42])}",
397
- f" L fingers [42:57]",
398
  ]
399
- for i, name in enumerate(FINGER_NAMES):
400
- parts.append(f" {_v(mask.left_fingers[i])} {name:7s} {_fmt(gl[i])}")
401
  if grip_l_str:
402
  parts.append(grip_l_str)
403
 
@@ -476,26 +429,6 @@ class UnifiedRenderer:
476
  ee.visible = False
477
  traj.visible = False
478
 
479
- # ─── Private: Fingers ─────────────────────────────────────────────────────
480
-
481
- def _update_fingers(self, t, fingers, finger_mask, handles, show):
482
- if fingers is None or t >= len(fingers):
483
- for h in handles:
484
- h.visible = False
485
- return
486
- if not show.get("fingertips", True):
487
- for h in handles:
488
- h.visible = False
489
- return
490
-
491
- g = fingers[t] # (5, 3)
492
- for fi, h in enumerate(handles):
493
- if finger_mask[fi]:
494
- h.position = g[fi].astype(np.float32)
495
- h.visible = True
496
- else:
497
- h.visible = False
498
-
499
  # ─── Private: Gripper ─────────────────────────────────────────────────────
500
 
501
  def _update_gripper(self, t, poses, gripper, wrist_active, finger_mask, handle, show):
@@ -702,27 +635,6 @@ class UnifiedRenderer:
702
  if self._robot_scene_model is None:
703
  return
704
 
705
- # Joint-position datasets (e.g. robomind-ur): bypass IK, use FK directly
706
- if state.joint_configs is not None:
707
- from cosmos_framework.data.vfm.action.urdf_visualizer.ik_solver import compute_mujoco_geom_transforms
708
- from cosmos_framework.data.vfm.action.urdf_visualizer.robot_scene_model import get_mjcf_path
709
-
710
- try:
711
- mjcf_path = get_mjcf_path(entry.robot_name)
712
- transforms, _, _fk_ee_poses, robot_frames = compute_mujoco_geom_transforms(
713
- mjcf_path, state.joint_configs
714
- )
715
- self._ik_right = transforms
716
- self._robot_frames_right = robot_frames
717
- self._rebuild_robot_frame_handles("right", robot_frames)
718
- log.info(f"FK geom transforms computed for {len(transforms)} frames ({entry.robot_name})")
719
- except Exception as e:
720
- log.warning(f"FK failed for {entry.robot_name}: {e}")
721
- import traceback
722
-
723
- traceback.print_exc()
724
- return
725
-
726
  # Right arm IK
727
  if state.right_poses is not None:
728
  try:
@@ -807,8 +719,6 @@ class UnifiedRenderer:
807
  self.left_traj,
808
  ]:
809
  attr.visible = False
810
- for h in self.right_fingers + self.left_fingers:
811
- h.visible = False
812
  for h in self.right_gripper_tips + self.left_gripper_tips:
813
  h.visible = False
814
  for h in self.robot_right + self.robot_left:
 
15
 
16
  from cosmos_framework.utils import log
17
  from cosmos_framework.data.vfm.action.urdf_visualizer.robot_scene_model import RobotSceneModel
18
+ from cosmos_framework.data.vfm.action.urdf_visualizer.unified_action import SceneState
19
 
20
 
21
  class UnifiedRenderer:
 
41
  COLOR_EGO_TOP = (231, 76, 60) # red
42
  COLOR_RIGHT = (243, 156, 18) # orange
43
  COLOR_LEFT = (155, 89, 182) # purple
44
+ GRIPPER_TIP_COLORS = [(231, 76, 60), (241, 196, 15)]
 
 
 
 
 
 
45
 
46
  @staticmethod
47
  def _soften_color(color: tuple[int, int, int], mix: float = 0.35) -> tuple[int, int, int]:
 
129
  point_size=0.015,
130
  point_shape="circle",
131
  )
 
 
 
 
 
 
 
 
 
132
  self.right_gripper_tips = [
133
  server.scene.add_icosphere(
134
  self._p(f"/right/gripper_tip_{side}"),
135
  radius=self.TIP_RADIUS,
136
+ color=self.GRIPPER_TIP_COLORS[i],
137
  position=(0.0, 0.0, 0.0),
138
  )
139
+ for i, side in enumerate(("left", "right"))
140
  ]
141
 
142
  # ── Left effector ──
 
156
  point_size=0.015,
157
  point_shape="circle",
158
  )
 
 
 
 
 
 
 
 
 
159
  self.left_gripper_tips = [
160
  server.scene.add_icosphere(
161
  self._p(f"/left/gripper_tip_{side}"),
162
  radius=self.TIP_RADIUS,
163
+ color=self.GRIPPER_TIP_COLORS[i],
164
  position=(0.0, 0.0, 0.0),
165
  )
166
+ for i, side in enumerate(("left", "right"))
167
  ]
168
 
169
  # ── IK robot meshes ──
 
264
  self.right_traj,
265
  show,
266
  )
 
 
 
 
 
 
 
267
  self._update_gripper(
268
  t,
269
  state.right_poses,
 
284
  self.left_traj,
285
  show,
286
  )
 
 
 
 
 
 
 
287
  self._update_gripper(
288
  t,
289
  state.left_poses,
 
326
  def _v(active):
327
  return "✓" if active else "·"
328
 
 
 
 
329
  # Gripper auxiliary values (not in 57D vector)
330
  grip_r_str = ""
331
  grip_l_str = ""
 
342
  "",
343
  f"{_v(mask.right_wrist)} R wrist pos [9:12] {_fmt(a[9:12])}",
344
  f" {' ' * 1} rot [12:18] {_fmt(a[12:18])}",
 
345
  ]
 
 
346
  if grip_r_str:
347
  parts.append(grip_r_str)
348
 
 
350
  "",
351
  f"{_v(mask.left_wrist)} L wrist pos [33:36] {_fmt(a[33:36])}",
352
  f" {' ' * 1} rot [36:42] {_fmt(a[36:42])}",
 
353
  ]
 
 
354
  if grip_l_str:
355
  parts.append(grip_l_str)
356
 
 
429
  ee.visible = False
430
  traj.visible = False
431
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
432
  # ─── Private: Gripper ─────────────────────────────────────────────────────
433
 
434
  def _update_gripper(self, t, poses, gripper, wrist_active, finger_mask, handle, show):
 
635
  if self._robot_scene_model is None:
636
  return
637
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
638
  # Right arm IK
639
  if state.right_poses is not None:
640
  try:
 
719
  self.left_traj,
720
  ]:
721
  attr.visible = False
 
 
722
  for h in self.right_gripper_tips + self.left_gripper_tips:
723
  h.visible = False
724
  for h in self.robot_right + self.robot_left:
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/ur5e_robotiq_2f85.xml DELETED
@@ -1,326 +0,0 @@
1
- <!--
2
- SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
- SPDX-License-Identifier: OpenMDW-1.1
4
- -->
5
-
6
- <!-- UR5e + Robotiq 2F-85 gripper composite MJCF.
7
- Created for RoboMIND UR visualization.
8
- Arm (UR5e): qpos[0:6] — shoulder_pan, shoulder_lift, elbow, wrist_1, wrist_2, wrist_3
9
- Gripper (Robotiq 2F-85): controlled via fingers_actuator tendon (ctrlrange 0-255).
10
- Gripper signal from dataset: 0=open, 1=closed; scale by 255 before setting ctrl.
11
- Mesh assets from MuJoCo Menagerie:
12
- universal_robots_ur5e/assets/ (UR5e OBJ meshes)
13
- robotiq_2f85_v4/assets/ (Robotiq STL meshes)
14
- Both are merged into a single assets/ directory by _build_ur5e_robotiq().
15
- -->
16
- <mujoco model="ur5e with robotiq 2f85">
17
- <compiler angle="radian" meshdir="assets" autolimits="true"/>
18
-
19
- <option integrator="implicitfast"/>
20
-
21
- <default>
22
- <default class="ur5e">
23
- <material specular="0.5" shininess="0.25"/>
24
- <joint axis="0 1 0" range="-6.28319 6.28319" armature="0.1"/>
25
- <general gaintype="fixed" biastype="affine" ctrlrange="-6.2831 6.2831" gainprm="2000"
26
- biasprm="0 -2000 -400" forcerange="-150 150"/>
27
- <default class="size3">
28
- <default class="size3_limited">
29
- <joint range="-3.1415 3.1415"/>
30
- <general ctrlrange="-3.1415 3.1415"/>
31
- </default>
32
- </default>
33
- <default class="size1">
34
- <general gainprm="500" biasprm="0 -500 -100" forcerange="-28 28"/>
35
- </default>
36
- <default class="visual">
37
- <geom type="mesh" contype="0" conaffinity="0" group="2"/>
38
- </default>
39
- <default class="collision">
40
- <geom type="capsule" group="3"/>
41
- <default class="eef_collision">
42
- <geom type="cylinder"/>
43
- </default>
44
- </default>
45
- <site size="0.001" rgba="0.5 0.5 0.5 0.3" group="4"/>
46
- </default>
47
-
48
- <default class="2f85">
49
- <mesh scale="0.001 0.001 0.001"/>
50
- <general biastype="affine"/>
51
- <joint axis="0 0 1"/>
52
- <default class="driver">
53
- <joint range="0 0.9" armature="0.005" damping="0.1"
54
- solimplimit="0.95 0.99 0.001" solreflimit="0.005 1"/>
55
- </default>
56
- <default class="follower">
57
- <joint range="-0.872664 0.9" armature="0.001"
58
- solimplimit="0.95 0.99 0.001" solreflimit="0.005 1"/>
59
- </default>
60
- <default class="spring_link">
61
- <joint range="-0.29670597283 0.9" armature="0.001"
62
- stiffness="0.05" springref="2.62" damping="0.00125"/>
63
- </default>
64
- <default class="coupler">
65
- <joint range="-1.57 0" armature="0.001"
66
- solimplimit="0.95 0.99 0.001" solreflimit="0.005 1"/>
67
- </default>
68
- <default class="visual_gripper">
69
- <geom type="mesh" contype="0" conaffinity="0" group="2" material="robotiq_black"/>
70
- </default>
71
- <default class="collision_gripper">
72
- <geom group="3" type="mesh" contype="0" conaffinity="0"/>
73
- <default class="pad_box1">
74
- <geom mass="0.00175" type="box" pos="0.043258 0 0.12"
75
- size="0.002 0.011 0.009375" friction="0.7"
76
- solimp="0.95 0.99 0.001" solref="0.004 1" priority="1"
77
- rgba="0.55 0.55 0.55 1"/>
78
- </default>
79
- <default class="pad_box2">
80
- <geom mass="0.00175" type="box" pos="0.043258 0 0.13875"
81
- size="0.002 0.011 0.009375" friction="0.6"
82
- solimp="0.95 0.99 0.001" solref="0.004 1" priority="1"
83
- rgba="0.45 0.45 0.45 1"/>
84
- </default>
85
- </default>
86
- </default>
87
- </default>
88
-
89
- <asset>
90
- <!-- UR5e materials -->
91
- <material class="ur5e" name="ur5e_black" rgba="0.033 0.033 0.033 1"/>
92
- <material class="ur5e" name="jointgray" rgba="0.278 0.278 0.278 1"/>
93
- <material class="ur5e" name="linkgray" rgba="0.82 0.82 0.82 1"/>
94
- <material class="ur5e" name="urblue" rgba="0.49 0.678 0.8 1"/>
95
-
96
- <!-- Robotiq materials -->
97
- <material name="robotiq_metal" rgba="0.58 0.58 0.58 1"/>
98
- <material name="robotiq_silicone" rgba="0.1882 0.1882 0.1882 1"/>
99
- <material name="robotiq_black" rgba="0.149 0.149 0.149 1"/>
100
-
101
- <!-- UR5e meshes -->
102
- <mesh file="base_0.obj"/>
103
- <mesh file="base_1.obj"/>
104
- <mesh file="shoulder_0.obj"/>
105
- <mesh file="shoulder_1.obj"/>
106
- <mesh file="shoulder_2.obj"/>
107
- <mesh file="upperarm_0.obj"/>
108
- <mesh file="upperarm_1.obj"/>
109
- <mesh file="upperarm_2.obj"/>
110
- <mesh file="upperarm_3.obj"/>
111
- <mesh file="forearm_0.obj"/>
112
- <mesh file="forearm_1.obj"/>
113
- <mesh file="forearm_2.obj"/>
114
- <mesh file="forearm_3.obj"/>
115
- <mesh file="wrist1_0.obj"/>
116
- <mesh file="wrist1_1.obj"/>
117
- <mesh file="wrist1_2.obj"/>
118
- <mesh file="wrist2_0.obj"/>
119
- <mesh file="wrist2_1.obj"/>
120
- <mesh file="wrist2_2.obj"/>
121
- <mesh file="wrist3.obj"/>
122
-
123
- <!-- Robotiq 2F-85 meshes — NO class: STL assets are in meters, scale=0.001 must NOT apply -->
124
- <mesh file="base.stl"/>
125
- <mesh file="base_coupling.stl"/>
126
- <mesh file="c-a01-85-open.stl"/>
127
- <mesh file="driver.stl"/>
128
- <mesh file="coupler.stl"/>
129
- <mesh file="spring_link.stl"/>
130
- <mesh file="follower.stl"/>
131
- <mesh file="tongue.stl"/>
132
- </asset>
133
-
134
- <worldbody>
135
- <light name="spotlight" mode="targetbodycom" target="wrist_2_link" pos="0 -1 2"/>
136
- <body name="base" quat="0 0 0 -1" childclass="ur5e">
137
- <inertial mass="4.0" pos="0 0 0" diaginertia="0.00443333156 0.00443333156 0.0072"/>
138
- <geom mesh="base_0" material="ur5e_black" class="visual"/>
139
- <geom mesh="base_1" material="jointgray" class="visual"/>
140
- <body name="shoulder_link" pos="0 0 0.163">
141
- <inertial mass="3.7" pos="0 0 0" diaginertia="0.0102675 0.0102675 0.00666"/>
142
- <joint name="shoulder_pan_joint" class="size3" axis="0 0 1"/>
143
- <geom mesh="shoulder_0" material="urblue" class="visual"/>
144
- <geom mesh="shoulder_1" material="ur5e_black" class="visual"/>
145
- <geom mesh="shoulder_2" material="jointgray" class="visual"/>
146
- <geom class="collision" size="0.06 0.06" pos="0 0 -0.04"/>
147
- <body name="upper_arm_link" pos="0 0.138 0" quat="1 0 1 0">
148
- <inertial mass="8.393" pos="0 0 0.2125" diaginertia="0.133886 0.133886 0.0151074"/>
149
- <joint name="shoulder_lift_joint" class="size3"/>
150
- <geom mesh="upperarm_0" material="linkgray" class="visual"/>
151
- <geom mesh="upperarm_1" material="ur5e_black" class="visual"/>
152
- <geom mesh="upperarm_2" material="jointgray" class="visual"/>
153
- <geom mesh="upperarm_3" material="urblue" class="visual"/>
154
- <geom class="collision" pos="0 -0.04 0" quat="1 1 0 0" size="0.06 0.06"/>
155
- <geom class="collision" size="0.05 0.2" pos="0 0 0.2"/>
156
- <body name="forearm_link" pos="0 -0.131 0.425">
157
- <inertial mass="2.275" pos="0 0 0.196" diaginertia="0.0311796 0.0311796 0.004095"/>
158
- <joint name="elbow_joint" class="size3_limited"/>
159
- <geom mesh="forearm_0" material="urblue" class="visual"/>
160
- <geom mesh="forearm_1" material="linkgray" class="visual"/>
161
- <geom mesh="forearm_2" material="ur5e_black" class="visual"/>
162
- <geom mesh="forearm_3" material="jointgray" class="visual"/>
163
- <geom class="collision" pos="0 0.08 0" quat="1 1 0 0" size="0.055 0.06"/>
164
- <geom class="collision" size="0.038 0.19" pos="0 0 0.2"/>
165
- <body name="wrist_1_link" pos="0 0 0.392" quat="1 0 1 0">
166
- <inertial mass="1.219" pos="0 0.127 0" diaginertia="0.0025599 0.0025599 0.0021942"/>
167
- <joint name="wrist_1_joint" class="size1"/>
168
- <geom mesh="wrist1_0" material="ur5e_black" class="visual"/>
169
- <geom mesh="wrist1_1" material="urblue" class="visual"/>
170
- <geom mesh="wrist1_2" material="jointgray" class="visual"/>
171
- <geom class="collision" pos="0 0.05 0" quat="1 1 0 0" size="0.04 0.07"/>
172
- <body name="wrist_2_link" pos="0 0.127 0">
173
- <inertial mass="1.219" pos="0 0 0.1" diaginertia="0.0025599 0.0025599 0.0021942"/>
174
- <joint name="wrist_2_joint" axis="0 0 1" class="size1"/>
175
- <geom mesh="wrist2_0" material="ur5e_black" class="visual"/>
176
- <geom mesh="wrist2_1" material="urblue" class="visual"/>
177
- <geom mesh="wrist2_2" material="jointgray" class="visual"/>
178
- <geom class="collision" size="0.04 0.06" pos="0 0 0.04"/>
179
- <geom class="collision" pos="0 0.02 0.1" quat="1 1 0 0" size="0.04 0.04"/>
180
- <body name="wrist_3_link" pos="0 0 0.1">
181
- <inertial mass="0.1889" pos="0 0.0771683 0" quat="1 0 0 1"
182
- diaginertia="0.000132134 9.90863e-05 9.90863e-05"/>
183
- <joint name="wrist_3_joint" class="size1"/>
184
- <geom material="linkgray" mesh="wrist3" class="visual"/>
185
- <geom class="eef_collision" pos="0 0.08 0" quat="1 1 0 0" size="0.04 0.02"/>
186
- <site name="attachment_site" pos="0 0.1 0" quat="-1 1 0 0"/>
187
- <!-- Robotiq 2F-85 attached at the UR5e flange site -->
188
- <body name="robotiq_base" childclass="2f85" pos="0 0.1 0" quat="-1 1 0 0">
189
- <inertial mass="0.777441" pos="7.77116e-05 8.42713e-05 0.0311656"
190
- quat="0.704758 -0.00373684 -0.00570287 0.709415"
191
- diaginertia="0.000260285 0.000225381 0.000152708"/>
192
- <geom class="visual_gripper" pos="0 0 0.0108" quat="0 0 0 1" mesh="base"/>
193
- <geom class="visual_gripper" pos="0 0 0.004" quat="1 -1 0 0" mesh="base_coupling" material="robotiq_metal"/>
194
- <geom class="visual_gripper" pos="0 0 0.0108" quat="1 0 0 0" material="robotiq_metal" mesh="c-a01-85-open"/>
195
- <geom class="collision_gripper" pos="0 0 0.0108" quat="0 0 0 1" mesh="base"/>
196
- <body name="robotiq_left_driver" pos="-0.0306011 0.00475 0.0657045" quat="1 -1 0 0">
197
- <inertial mass="0.00899563" pos="-0.0175297 0.00165308 -0.00469625"
198
- quat="-0.469642 0.469642 -0.528617 0.528617"
199
- diaginertia="1.72352e-06 1.60906e-06 3.22006e-07"/>
200
- <joint name="left_driver_joint" class="driver"/>
201
- <geom class="visual_gripper" pos="0.0306011 0.0549045 -0.0047" quat="1 1 0 0" material="robotiq_metal" mesh="driver"/>
202
- <geom class="collision_gripper" pos="0.0306011 0.0549045 -0.0047" quat="1 1 0 0" mesh="driver"/>
203
- <body name="robotiq_left_coupler" pos="-0.0314249 0.00453223 -0.0102" quat="0 0 0 1">
204
- <inertial mass="0.0140974" pos="0.00367747 0.01986 0.0055"
205
- quat="0.701447 -0.701447 0.0892884 -0.0892884"
206
- diaginertia="4.16206e-06 3.52216e-06 8.88131e-07"/>
207
- <geom class="visual_gripper" pos="-0.062026 -0.0503723 0.0055" quat="1 -1 0 0" mesh="coupler"/>
208
- <geom class="collision_gripper" pos="-0.062026 -0.0503723 0.0055" quat="1 -1 0 0" mesh="coupler"/>
209
- </body>
210
- </body>
211
- <body name="robotiq_left_spring_link" pos="-0.0127 -0.012 0.07222" quat="1 -1 0 0">
212
- <inertial mass="0.0221642" pos="-0.0183 -0.0205732 0.01205"
213
- quat="0.660941 0.660941 -0.251309 -0.251309"
214
- diaginertia="8.96853e-06 6.71733e-06 2.63931e-06"/>
215
- <joint name="left_spring_link_joint" class="spring_link"/>
216
- <geom class="visual_gripper" pos="0.0127 0.06142 0.01205" quat="1 1 0 0" mesh="spring_link"/>
217
- <geom class="collision_gripper" pos="0.0127 0.06142 0.01205" quat="1 1 0 0" mesh="spring_link"/>
218
- <body name="robotiq_left_follower" pos="-0.0382079 -0.0425003 0.00295" quat="0 -1 -1.90231e-05 0">
219
- <inertial mass="0.0125222" pos="-0.00852976 -0.0014822 -0.00910001"
220
- quat="0.359439 0.359439 0.608937 0.608937"
221
- diaginertia="2.67415e-06 2.4559e-06 6.02031e-07"/>
222
- <joint name="left_follower_joint" class="follower"/>
223
- <geom class="visual_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" mesh="follower"/>
224
- <geom class="visual_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" material="robotiq_metal" mesh="tongue"/>
225
- <geom class="collision_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" mesh="follower"/>
226
- <geom class="collision_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" mesh="tongue"/>
227
- <body name="robotiq_left_pad" pos="-0.0377897 -0.103916 -0.0091" quat="1 -1 0 0">
228
- <geom class="pad_box1" name="left_pad1"/>
229
- <geom class="pad_box2" name="left_pad2"/>
230
- </body>
231
- </body>
232
- </body>
233
- <body name="robotiq_right_driver" pos="0.0306011 -0.00475 0.0657045" quat="0 0 -1 1">
234
- <inertial mass="0.00899563" pos="-0.0175297 0.00165308 -0.00469625"
235
- quat="-0.469642 0.469642 -0.528617 0.528617"
236
- diaginertia="1.72352e-06 1.60906e-06 3.22006e-07"/>
237
- <joint name="right_driver_joint" class="driver"/>
238
- <geom class="visual_gripper" pos="0.0306011 0.0549045 -0.0047" quat="1 1 0 0" material="robotiq_metal" mesh="driver"/>
239
- <geom class="collision_gripper" pos="0.0306011 0.0549045 -0.0047" quat="1 1 0 0" mesh="driver"/>
240
- <body name="robotiq_right_coupler" pos="-0.0314249 0.00453223 -0.0102" quat="0 0 0 1">
241
- <inertial mass="0.0140974" pos="0.00367747 0.01986 0.0055"
242
- quat="0.701447 -0.701447 0.0892884 -0.0892884"
243
- diaginertia="4.16206e-06 3.52216e-06 8.88131e-07"/>
244
- <geom class="visual_gripper" pos="-0.062026 -0.0503723 0.0055" quat="1 -1 0 0" mesh="coupler"/>
245
- <geom class="collision_gripper" pos="-0.062026 -0.0503723 0.0055" quat="1 -1 0 0" mesh="coupler"/>
246
- </body>
247
- </body>
248
- <body name="robotiq_right_spring_link" pos="0.0127 0.012 0.07222" quat="0 0 -1 1">
249
- <inertial mass="0.0221642" pos="-0.0183 -0.0205732 0.01205"
250
- quat="0.660941 0.660941 -0.251309 -0.251309"
251
- diaginertia="8.96853e-06 6.71733e-06 2.63931e-06"/>
252
- <joint name="right_spring_link_joint" class="spring_link"/>
253
- <geom class="visual_gripper" pos="0.0127 0.06142 0.01205" quat="1 1 0 0" mesh="spring_link"/>
254
- <geom class="collision_gripper" pos="0.0127 0.06142 0.01205" quat="1 1 0 0" mesh="spring_link"/>
255
- <body name="robotiq_right_follower" pos="-0.0382079 -0.0425003 0.00295" quat="0 -1 0 0">
256
- <inertial mass="0.0125222" pos="-0.00852976 -0.0014822 -0.00910001"
257
- quat="0.359439 0.359439 0.608937 0.608937"
258
- diaginertia="2.67415e-06 2.4559e-06 6.02031e-07"/>
259
- <joint name="right_follower_joint" class="follower"/>
260
- <geom class="visual_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" material="robotiq_metal" mesh="tongue"/>
261
- <geom class="visual_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" mesh="follower"/>
262
- <geom class="collision_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" mesh="tongue"/>
263
- <geom class="collision_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" mesh="follower"/>
264
- <body name="robotiq_right_pad" pos="-0.0377897 -0.103916 -0.0091" quat="1 -1 0 0">
265
- <geom class="pad_box1" name="right_pad1"/>
266
- <geom class="pad_box2" name="right_pad2"/>
267
- </body>
268
- </body>
269
- </body>
270
- </body>
271
- </body>
272
- </body>
273
- </body>
274
- </body>
275
- </body>
276
- </body>
277
- </body>
278
- </worldbody>
279
-
280
- <!-- UR5e arm actuators -->
281
- <actuator>
282
- <general class="size3" name="shoulder_pan" joint="shoulder_pan_joint"/>
283
- <general class="size3" name="shoulder_lift" joint="shoulder_lift_joint"/>
284
- <general class="size3_limited" name="elbow" joint="elbow_joint"/>
285
- <general class="size1" name="wrist_1" joint="wrist_1_joint"/>
286
- <general class="size1" name="wrist_2" joint="wrist_2_joint"/>
287
- <general class="size1" name="wrist_3" joint="wrist_3_joint"/>
288
- <!-- Robotiq gripper actuator (ctrl 0=open, 255=closed) -->
289
- <general class="2f85" name="fingers_actuator" tendon="split"
290
- forcerange="-5 5" ctrlrange="0 255"
291
- gainprm="0.3137255 0 0" biasprm="0 -100 -10"/>
292
- </actuator>
293
-
294
- <contact>
295
- <exclude body1="robotiq_base" body2="robotiq_left_driver"/>
296
- <exclude body1="robotiq_base" body2="robotiq_right_driver"/>
297
- <exclude body1="robotiq_base" body2="robotiq_left_spring_link"/>
298
- <exclude body1="robotiq_base" body2="robotiq_right_spring_link"/>
299
- <exclude body1="robotiq_right_coupler" body2="robotiq_right_follower"/>
300
- <exclude body1="robotiq_left_coupler" body2="robotiq_left_follower"/>
301
- </contact>
302
-
303
- <tendon>
304
- <fixed name="split">
305
- <joint joint="right_driver_joint" coef="0.485"/>
306
- <joint joint="left_driver_joint" coef="0.485"/>
307
- </fixed>
308
- </tendon>
309
-
310
- <equality>
311
- <connect anchor="-0.0179014 -0.00651468 0.0044"
312
- body1="robotiq_right_follower" body2="robotiq_right_coupler"
313
- solimp="0.95 0.99 0.001" solref="0.005 1"/>
314
- <connect anchor="-0.0179014 -0.00651468 0.0044"
315
- body1="robotiq_left_follower" body2="robotiq_left_coupler"
316
- solimp="0.95 0.99 0.001" solref="0.005 1"/>
317
- <joint joint1="right_driver_joint" joint2="left_driver_joint"
318
- polycoef="0 1 0 0 0" solimp="0.95 0.99 0.001" solref="0.005 1"/>
319
- </equality>
320
-
321
- <keyframe>
322
- <key name="home" qpos="-1.5708 -1.5708 1.5708 -1.5708 -1.5708 0 0 0 0 0 0 0"
323
- ctrl="-1.5708 -1.5708 1.5708 -1.5708 -1.5708 0 0"/>
324
- </keyframe>
325
-
326
- </mujoco>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/urdf_loader.py CHANGED
@@ -164,7 +164,6 @@ _EE_FRAME_CANDIDATES = [
164
  "gripper", # Google Robot (Menagerie site)
165
  "hand", # Franka Panda (standard gripper)
166
  "attachment", # Franka + Robotiq
167
- "attachment_site", # UR5e (mujoco_menagerie)
168
  "wx250s/gripper_link", # WidowX 250S (Menagerie)
169
  "ee_link", # generic
170
  "tool0", # generic industrial
@@ -257,19 +256,6 @@ ROBOT_CONFIGS = {
257
  "finger_joint_names": ["left_finger", "right_finger"],
258
  "camera_body": None,
259
  },
260
- "ur5e": {
261
- "menagerie": "ur5e_robotiq",
262
- "mjcf": "ur5e_robotiq_2f85.xml",
263
- # EE: attachment_site — the UR5e flange site where the Robotiq mounts.
264
- # joint_configs[:, 6] raw UR gripper maps directly to Robotiq ctrl:
265
- # ctrl = raw * 255 (0=open, 255=closed).
266
- "ee_frame": "attachment_site",
267
- "n_arm_joints": 6,
268
- "finger_min": 0.0,
269
- "finger_max": 255.0,
270
- "finger_close_is_max": False,
271
- "camera_body": None,
272
- },
273
 
274
  }
275
 
@@ -344,10 +330,6 @@ def _ensure_robot_assets(robot_name: str) -> Path:
344
  if robot_name == "droid_franka_robotiq":
345
  return _build_droid_franka_robotiq()
346
 
347
- # Composite: UR5e arm + Robotiq 2F-85 gripper (RoboMIND UR setup)
348
- if robot_name == "ur5e_robotiq":
349
- return _build_ur5e_robotiq()
350
-
351
  log.info(f"Downloading {robot_name} assets from mujoco_menagerie...")
352
  _download_menagerie_model(robot_name)
353
  return cached_dir
@@ -420,38 +402,6 @@ def _build_droid_franka_robotiq() -> Path:
420
  return dst
421
 
422
 
423
- def _build_ur5e_robotiq() -> Path:
424
- """Prepare composite UR5e + Robotiq 2F-85 MJCF for RoboMIND UR visualization.
425
-
426
- The hand-tuned composite XML is committed in the repo alongside this
427
- module (``ur5e_robotiq_2f85.xml``). This function downloads the mesh
428
- assets from MuJoCo Menagerie (``universal_robots_ur5e`` +
429
- ``robotiq_2f85_v4``) and copies them together with the XML into a cache
430
- directory that MuJoCo can load from.
431
- """
432
- import shutil
433
-
434
- log.info("Preparing composite UR5e + Robotiq 2F-85 (committed XML + Menagerie assets)...")
435
-
436
- ur5e_dir = _download_menagerie_model("universal_robots_ur5e")
437
- robotiq_dir = _download_menagerie_model("robotiq_2f85_v4")
438
-
439
- dst = _ROBOT_CACHE_DIR / "ur5e_robotiq"
440
- dst.mkdir(parents=True, exist_ok=True)
441
- assets_dir = dst / "assets"
442
- assets_dir.mkdir(exist_ok=True)
443
-
444
- for f in (ur5e_dir / "assets").iterdir():
445
- shutil.copy2(f, assets_dir / f.name)
446
- for f in (robotiq_dir / "assets").iterdir():
447
- shutil.copy2(f, assets_dir / f.name)
448
-
449
- committed_xml = Path(__file__).parent / "ur5e_robotiq_2f85.xml"
450
- shutil.copy2(committed_xml, dst / "ur5e_robotiq_2f85.xml")
451
-
452
- log.info(f"Prepared composite UR5e + Robotiq 2F-85 at {dst}")
453
- return dst
454
-
455
 
456
  # ── Robot loaders ────────────────────────────────────────────────────────────
457
 
@@ -563,35 +513,6 @@ def _load_widowx() -> tuple[list, np.ndarray]:
563
  return meshes, ee_pose
564
 
565
 
566
- def _load_ur5e() -> tuple[list, np.ndarray]:
567
- """Load UR5e + Robotiq 2F-85 composite from MuJoCo Menagerie."""
568
- import mujoco
569
-
570
- mjcf_dir = _ensure_robot_assets("ur5e_robotiq")
571
- mjcf_path = mjcf_dir / "ur5e_robotiq_2f85.xml"
572
- model = mujoco.MjModel.from_xml_path(str(mjcf_path))
573
- data = mujoco.MjData(model)
574
-
575
- # UR5e home: -90, -90, 90, -90, -90, 0 (degrees → radians); gripper open
576
- home_qpos = np.array([-1.5708, -1.5708, 1.5708, -1.5708, -1.5708, 0.0])
577
- data.qpos[: len(home_qpos)] = home_qpos
578
- mujoco.mj_forward(model, data)
579
-
580
- meshes = _extract_mujoco_meshes(model, data)
581
-
582
- # EE pose: use robotiq_base body (the flange-to-gripper attachment point)
583
- ee_pose = np.eye(4, dtype=np.float32)
584
- for i in range(model.nbody):
585
- name = mujoco.mj_id2name(model, mujoco.mjtObj.mjOBJ_BODY, i) or ""
586
- if name == "robotiq_base":
587
- ee_pose[:3, 3] = data.xpos[i].astype(np.float32)
588
- ee_pose[:3, :3] = data.xmat[i].reshape(3, 3).astype(np.float32)
589
- break
590
-
591
- log.info(f"UR5e+Robotiq loaded: {len(meshes)} meshes, EE pos={ee_pose[:3, 3]}")
592
- return meshes, ee_pose
593
-
594
-
595
  def _parse_urdf_origin(origin_element) -> np.ndarray:
596
  transform = np.eye(4, dtype=np.float32)
597
  if origin_element is None:
@@ -856,64 +777,4 @@ def get_robot_loaders() -> dict[str, callable]:
856
  "google_robot": _load_google_robot,
857
  "franka_panda": _load_franka_panda,
858
  "widowx": _load_widowx,
859
- "ur5e": _load_ur5e,
860
  }
861
-
862
-
863
- def extract_gripper_openings(unified_57d: np.ndarray, robot_name: str = "google_robot") -> np.ndarray:
864
- """Extract gripper opening fractions from unified action grasp state.
865
-
866
- Uses fingertip spread (f0-f1 distance) to invert the FK and recover
867
- the scalar gripper opening at each timestep.
868
-
869
- Args:
870
- unified_57d: (T, 57) unified action.
871
- robot_name: Robot identifier for FK lookup table.
872
-
873
- Returns:
874
- (T+1,) array of gripper openings in [0, 1].
875
- """
876
- T = unified_57d.shape[0]
877
- grasp = unified_57d[:, 18:33].reshape(T, 5, 3)
878
- all_grasp = np.concatenate([grasp[0:1], grasp], axis=0)
879
-
880
- # Build monotonic inverse lookup from FK (robot-specific)
881
- _gs = np.linspace(0, 1, 10001).astype(np.float32)
882
- try:
883
- if robot_name == "franka_panda":
884
- from cosmos_framework.data.vfm.action.robot_descriptions.franka import franka_fingertip_fk
885
-
886
- _tips = franka_fingertip_fk(_gs)
887
- elif robot_name == "widowx":
888
- from cosmos_framework.data.vfm.action.robot_descriptions.widowx import widowx_fingertip_fk
889
-
890
- _tips = widowx_fingertip_fk(_gs)
891
- elif robot_name == "ur5e":
892
- from cosmos_framework.data.vfm.action.robot_descriptions.umi import _WSG50_MAX_WIDTH, umi_fingertip_fk
893
-
894
- _tips = umi_fingertip_fk(_gs * _WSG50_MAX_WIDTH)
895
- else:
896
- from cosmos_framework.data.vfm.action.robot_descriptions.google_robot import (
897
- google_robot_fingertip_fk_vectorized,
898
- )
899
-
900
- _tips = google_robot_fingertip_fk_vectorized(_gs)
901
- _spreads = np.linalg.norm(_tips[:, 0] - _tips[:, 1], axis=1)
902
- except ImportError:
903
- # Fallback: linear approximation
904
- _spreads = _gs * 0.145
905
- _min_idx = int(np.argmin(_spreads))
906
- mono_gs = _gs[_min_idx:]
907
- mono_spreads = _spreads[_min_idx:]
908
-
909
- openings = np.zeros(T + 1, dtype=np.float32)
910
- for t in range(T + 1):
911
- f0, f1 = all_grasp[t, 0], all_grasp[t, 1]
912
- spread = np.linalg.norm(f0 - f1)
913
- if spread <= mono_spreads[0]:
914
- openings[t] = 0.0
915
- elif spread >= mono_spreads[-1]:
916
- openings[t] = 1.0
917
- else:
918
- openings[t] = float(np.interp(spread, mono_spreads, mono_gs))
919
- return openings
 
164
  "gripper", # Google Robot (Menagerie site)
165
  "hand", # Franka Panda (standard gripper)
166
  "attachment", # Franka + Robotiq
 
167
  "wx250s/gripper_link", # WidowX 250S (Menagerie)
168
  "ee_link", # generic
169
  "tool0", # generic industrial
 
256
  "finger_joint_names": ["left_finger", "right_finger"],
257
  "camera_body": None,
258
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
259
 
260
  }
261
 
 
330
  if robot_name == "droid_franka_robotiq":
331
  return _build_droid_franka_robotiq()
332
 
 
 
 
 
333
  log.info(f"Downloading {robot_name} assets from mujoco_menagerie...")
334
  _download_menagerie_model(robot_name)
335
  return cached_dir
 
402
  return dst
403
 
404
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
405
 
406
  # ── Robot loaders ────────────────────────────────────────────────────────────
407
 
 
513
  return meshes, ee_pose
514
 
515
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
516
  def _parse_urdf_origin(origin_element) -> np.ndarray:
517
  transform = np.eye(4, dtype=np.float32)
518
  if origin_element is None:
 
777
  "google_robot": _load_google_robot,
778
  "franka_panda": _load_franka_panda,
779
  "widowx": _load_widowx,
 
780
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py CHANGED
@@ -4,11 +4,8 @@
4
 
5
  """Interactive 3D viewer for robot action datasets.
6
 
7
- Uses the unified 57D action representation: every dataset declares one explicit
8
- raw ``ActionFormat`` (9D/10D/20D/57D), which is converted to
9
- ``UnifiedAction(action_57d, mask)`` before rendering.
10
-
11
- **57D layout**: ``[ego(9) | R_wrist(9) | R_fingers(15) | L_wrist(9) | L_fingers(15)]``
12
 
13
  Dependencies::
14
 
@@ -18,8 +15,6 @@ Usage:
18
  # Use each dataset's declared raw action format:
19
  uv run python cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py --share
20
 
21
- # Override the raw action format explicitly:
22
- uv run python cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py --action-format 57d --share
23
  """
24
 
25
  from __future__ import annotations
@@ -92,7 +87,6 @@ def _lazycfg_to_entry(
92
  dataset_class = target if isinstance(target, str) else f"{target.__module__}.{target.__qualname__}"
93
  ds_items = ds_cfg.items() if isinstance(ds_cfg, dict) else ds_cfg.items()
94
  dataset_kwargs = {key: value for key, value in ds_items if key != "_target_"}
95
- dataset_kwargs["action_normalization"] = None
96
  if viewer_overrides is not None:
97
  dataset_kwargs.update(viewer_overrides)
98
 
@@ -131,8 +125,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
131
  DATASET_UMI_256,
132
  )
133
 
134
- raw_action_override = {"action_normalization": None}
135
-
136
  from cosmos_framework.data.vfm.action.bridge_orig_lerobot_dataset import _BRIDGE_TO_OPENCV
137
  from cosmos_framework.data.vfm.action.droid_lerobot_dataset import _DROID_TO_OPENCV
138
  from cosmos_framework.data.vfm.action.fractal import _GOOGLE_ROBOT_TO_OPENCV
@@ -147,7 +139,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
147
  max_finger_width=0.0,
148
  fps=10,
149
  action_format=ActionFormat.EGO_9D,
150
- viewer_overrides=raw_action_override,
151
  ),
152
  "fractal": _lazycfg_to_entry(
153
  DATASET_FRACTAL_256,
@@ -158,7 +149,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
158
  camera_fov_deg=69.0,
159
  camera_aspect=320 / 256,
160
  to_opencv=_GOOGLE_ROBOT_TO_OPENCV,
161
- viewer_overrides=raw_action_override,
162
  ),
163
  "bridge": _lazycfg_to_entry(
164
  DATASET_BRIDGE_480,
@@ -167,7 +157,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
167
  fps=5,
168
  action_format=ActionFormat.SINGLE_ARM_10D,
169
  to_opencv=_BRIDGE_TO_OPENCV,
170
- viewer_overrides=raw_action_override,
171
  ),
172
  "droid": _lazycfg_to_entry(
173
  DATASET_DROID_480,
@@ -176,7 +165,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
176
  fps=15,
177
  action_format=ActionFormat.SINGLE_ARM_10D,
178
  to_opencv=_DROID_TO_OPENCV,
179
- viewer_overrides=raw_action_override,
180
  ),
181
  "umi": _lazycfg_to_entry(
182
  DATASET_UMI_256,
@@ -184,7 +172,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
184
  max_finger_width=0.0,
185
  fps=20,
186
  action_format=ActionFormat.SINGLE_ARM_10D,
187
- viewer_overrides=raw_action_override,
188
  ),
189
  "robomind_franka": _lazycfg_to_entry(
190
  DATASET_ROBOMIND_FRANKA_480,
@@ -193,7 +180,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
193
  fps=10,
194
  action_format=ActionFormat.SINGLE_ARM_10D,
195
  to_opencv=_FRANKA_TO_OPENCV,
196
- viewer_overrides=raw_action_override,
197
  ),
198
  "robomind_franka_dual": _lazycfg_to_entry(
199
  DATASET_ROBOMIND_FRANKA_DUAL_480,
@@ -210,7 +196,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
210
  dtype=np.float32,
211
  ),
212
  to_opencv=_FRANKA_TO_OPENCV,
213
- viewer_overrides=raw_action_override,
214
  ),
215
  }
216
 
@@ -238,7 +223,7 @@ def _create_dataset(entry: DatasetEntry, chunk_length: int):
238
 
239
  # UMI: factory function
240
  if callable(cls) and not inspect.isclass(cls):
241
- _OMEGACONF_BLOCKLIST = {"chunk_length", "split", "action_normalization", "enable_fast_init"}
242
  kwargs = {k: v for k, v in kwargs.items() if k not in _OMEGACONF_BLOCKLIST}
243
  kwargs["eager_load"] = True
244
  return cls(**kwargs)
@@ -312,17 +297,6 @@ def _build_viewer_idle_action_spec(action_format: ActionFormat) -> Any:
312
  Rot("rot6d", prefix="right"),
313
  Gripper(prefix="right"),
314
  )
315
- if action_format is ActionFormat.UNIFIED_57D:
316
- return build_action_spec(
317
- Pos(prefix="ego"),
318
- Rot("rot6d", prefix="ego"),
319
- Pos(prefix="right_wrist"),
320
- Rot("rot6d", prefix="right_wrist"),
321
- Pos(dim=15, prefix="right_fingers"),
322
- Pos(prefix="left_wrist"),
323
- Rot("rot6d", prefix="left_wrist"),
324
- Pos(dim=15, prefix="left_fingers"),
325
- )
326
  raise ValueError(f"Unsupported action format for idle-frame detection: {action_format}")
327
 
328
 
@@ -622,7 +596,7 @@ def launch_viewer(
622
  cam_panel = client.gui.add_image(np.zeros((64, 64, 3), dtype=np.uint8))
623
  renderer.set_video_panel(cam_panel)
624
 
625
- with client.gui.add_folder("Action (57D)"):
626
  action_text = client.gui.add_markdown("*No episode loaded*")
627
 
628
  show = {
@@ -645,7 +619,7 @@ def launch_viewer(
645
  )
646
 
647
  def _update_action_text(t: int) -> None:
648
- """Update the 57D action display for one client."""
649
  txt = renderer.format_action_text(t)
650
  action_text.content = txt if txt else "*No data*"
651
 
@@ -780,14 +754,6 @@ def launch_viewer(
780
  )
781
  state.video = get_video_from_sample(sample)
782
 
783
- # Inject FK joint configs when the dataset provides them (e.g. UR).
784
- jc = sample.get("joint_configs")
785
- if jc is not None:
786
- state.joint_configs = (
787
- jc.numpy().astype(np.float32)
788
- if isinstance(jc, torch.Tensor)
789
- else np.asarray(jc, dtype=np.float32)
790
- )
791
  status_text.content = "⏳ Loading robot animation..."
792
  renderer.load(state, entry, to_opencv=to_opencv)
793
  _rebuild_robot_frame_toggles()
@@ -805,7 +771,7 @@ def launch_viewer(
805
  + (f"Task: {ai_caption_text}\n\n" if ai_caption_text else "")
806
  + (f"Debug: {debug_caption_text}\n\n" if debug_caption_text else "")
807
  + (
808
- f"Steps: {T} | Raw: {raw_action_label} ({action_raw.shape[-1]}D) → 57D | "
809
  f"Robot: {entry.robot_name or '—'} | FPS: {entry.fps}"
810
  )
811
  )
@@ -947,7 +913,7 @@ def launch_viewer(
947
 
948
 
949
  def main():
950
- parser = argparse.ArgumentParser(description="Action dataset viewer (unified 57D)")
951
  parser.add_argument("--port", type=int, default=8013)
952
  parser.add_argument("--share", action="store_true")
953
  parser.add_argument("--chunk-length", type=int, default=16)
 
4
 
5
  """Interactive 3D viewer for robot action datasets.
6
 
7
+ Uses raw release action layouts: 9D ego/camera, 10D single-arm, and 20D dual-arm.
8
+ Each sample is converted to a small internal render state before drawing.
 
 
 
9
 
10
  Dependencies::
11
 
 
15
  # Use each dataset's declared raw action format:
16
  uv run python cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py --share
17
 
 
 
18
  """
19
 
20
  from __future__ import annotations
 
87
  dataset_class = target if isinstance(target, str) else f"{target.__module__}.{target.__qualname__}"
88
  ds_items = ds_cfg.items() if isinstance(ds_cfg, dict) else ds_cfg.items()
89
  dataset_kwargs = {key: value for key, value in ds_items if key != "_target_"}
 
90
  if viewer_overrides is not None:
91
  dataset_kwargs.update(viewer_overrides)
92
 
 
125
  DATASET_UMI_256,
126
  )
127
 
 
 
128
  from cosmos_framework.data.vfm.action.bridge_orig_lerobot_dataset import _BRIDGE_TO_OPENCV
129
  from cosmos_framework.data.vfm.action.droid_lerobot_dataset import _DROID_TO_OPENCV
130
  from cosmos_framework.data.vfm.action.fractal import _GOOGLE_ROBOT_TO_OPENCV
 
139
  max_finger_width=0.0,
140
  fps=10,
141
  action_format=ActionFormat.EGO_9D,
 
142
  ),
143
  "fractal": _lazycfg_to_entry(
144
  DATASET_FRACTAL_256,
 
149
  camera_fov_deg=69.0,
150
  camera_aspect=320 / 256,
151
  to_opencv=_GOOGLE_ROBOT_TO_OPENCV,
 
152
  ),
153
  "bridge": _lazycfg_to_entry(
154
  DATASET_BRIDGE_480,
 
157
  fps=5,
158
  action_format=ActionFormat.SINGLE_ARM_10D,
159
  to_opencv=_BRIDGE_TO_OPENCV,
 
160
  ),
161
  "droid": _lazycfg_to_entry(
162
  DATASET_DROID_480,
 
165
  fps=15,
166
  action_format=ActionFormat.SINGLE_ARM_10D,
167
  to_opencv=_DROID_TO_OPENCV,
 
168
  ),
169
  "umi": _lazycfg_to_entry(
170
  DATASET_UMI_256,
 
172
  max_finger_width=0.0,
173
  fps=20,
174
  action_format=ActionFormat.SINGLE_ARM_10D,
 
175
  ),
176
  "robomind_franka": _lazycfg_to_entry(
177
  DATASET_ROBOMIND_FRANKA_480,
 
180
  fps=10,
181
  action_format=ActionFormat.SINGLE_ARM_10D,
182
  to_opencv=_FRANKA_TO_OPENCV,
 
183
  ),
184
  "robomind_franka_dual": _lazycfg_to_entry(
185
  DATASET_ROBOMIND_FRANKA_DUAL_480,
 
196
  dtype=np.float32,
197
  ),
198
  to_opencv=_FRANKA_TO_OPENCV,
 
199
  ),
200
  }
201
 
 
223
 
224
  # UMI: factory function
225
  if callable(cls) and not inspect.isclass(cls):
226
+ _OMEGACONF_BLOCKLIST = {"chunk_length", "split", "enable_fast_init"}
227
  kwargs = {k: v for k, v in kwargs.items() if k not in _OMEGACONF_BLOCKLIST}
228
  kwargs["eager_load"] = True
229
  return cls(**kwargs)
 
297
  Rot("rot6d", prefix="right"),
298
  Gripper(prefix="right"),
299
  )
 
 
 
 
 
 
 
 
 
 
 
300
  raise ValueError(f"Unsupported action format for idle-frame detection: {action_format}")
301
 
302
 
 
596
  cam_panel = client.gui.add_image(np.zeros((64, 64, 3), dtype=np.uint8))
597
  renderer.set_video_panel(cam_panel)
598
 
599
+ with client.gui.add_folder("Action"):
600
  action_text = client.gui.add_markdown("*No episode loaded*")
601
 
602
  show = {
 
619
  )
620
 
621
  def _update_action_text(t: int) -> None:
622
+ """Update the action display for one client."""
623
  txt = renderer.format_action_text(t)
624
  action_text.content = txt if txt else "*No data*"
625
 
 
754
  )
755
  state.video = get_video_from_sample(sample)
756
 
 
 
 
 
 
 
 
 
757
  status_text.content = "⏳ Loading robot animation..."
758
  renderer.load(state, entry, to_opencv=to_opencv)
759
  _rebuild_robot_frame_toggles()
 
771
  + (f"Task: {ai_caption_text}\n\n" if ai_caption_text else "")
772
  + (f"Debug: {debug_caption_text}\n\n" if debug_caption_text else "")
773
  + (
774
+ f"Steps: {T} | Raw action: {raw_action_label} ({action_raw.shape[-1]}D) | "
775
  f"Robot: {entry.robot_name or '—'} | FPS: {entry.fps}"
776
  )
777
  )
 
913
 
914
 
915
  def main():
916
+ parser = argparse.ArgumentParser(description="Action dataset viewer")
917
  parser.add_argument("--port", type=int, default=8013)
918
  parser.add_argument("--share", action="store_true")
919
  parser.add_argument("--chunk-length", type=int, default=16)