Spaces:
Running
Running
Clean raw-action viewer release
Browse files- cosmos-framework/cosmos_framework/data/vfm/action/action_normalization.py +0 -49
- cosmos-framework/cosmos_framework/data/vfm/action/bridge_orig_lerobot_dataset.py +0 -4
- cosmos-framework/cosmos_framework/data/vfm/action/cosmos3_action_lerobot.py +4 -97
- cosmos-framework/cosmos_framework/data/vfm/action/domain_utils.py +0 -3
- cosmos-framework/cosmos_framework/data/vfm/action/droid_lerobot_dataset.py +2 -5
- cosmos-framework/cosmos_framework/data/vfm/action/fractal.py +0 -4
- cosmos-framework/cosmos_framework/data/vfm/action/libero_dataset.py +0 -611
- cosmos-framework/cosmos_framework/data/vfm/action/libero_pose_utils.py +0 -69
- cosmos-framework/cosmos_framework/data/vfm/action/normalizers/bridge_orig_lerobot_backward_framewise_rot6d.json +0 -33
- cosmos-framework/cosmos_framework/data/vfm/action/normalizers/droid_lerobot_backward_framewise_rot6d.json +0 -33
- cosmos-framework/cosmos_framework/data/vfm/action/normalizers/fractal_backward_framewise_rot6d.json +0 -33
- cosmos-framework/cosmos_framework/data/vfm/action/normalizers/libero_native_frame_wise_relative_rot6d.json +0 -37
- cosmos-framework/cosmos_framework/data/vfm/action/normalizers/robomind-franka-dual_backward_framewise_rot6d.json +0 -33
- cosmos-framework/cosmos_framework/data/vfm/action/normalizers/robomind-franka_backward_framewise_rot6d.json +0 -33
- cosmos-framework/cosmos_framework/data/vfm/action/robomind_franka_dataset.py +0 -3
- cosmos-framework/cosmos_framework/data/vfm/action/umi_lerobot_dataset.py +1 -3
- cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/README.md +12 -28
- cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/action_datasets.py +0 -6
- cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/ik_solver.py +2 -2
- cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/unified_action.py +68 -326
- cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/unified_renderer.py +6 -96
- cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/ur5e_robotiq_2f85.xml +0 -326
- cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/urdf_loader.py +0 -139
- cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py +7 -41
cosmos-framework/cosmos_framework/data/vfm/action/action_normalization.py
DELETED
|
@@ -1,49 +0,0 @@
|
|
| 1 |
-
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
-
# SPDX-License-Identifier: OpenMDW-1.1
|
| 3 |
-
|
| 4 |
-
"""Action normalization helpers."""
|
| 5 |
-
|
| 6 |
-
import json
|
| 7 |
-
from pathlib import Path
|
| 8 |
-
|
| 9 |
-
import numpy as np
|
| 10 |
-
import torch
|
| 11 |
-
|
| 12 |
-
from cosmos_framework.utils import log
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
def load_action_stats(stats_path: str, stats_key: str = "global") -> dict[str, np.ndarray]:
|
| 16 |
-
"""Load pre-computed action normalization stats from a JSON file."""
|
| 17 |
-
path = Path(stats_path)
|
| 18 |
-
if not path.exists():
|
| 19 |
-
raise FileNotFoundError(f"Action normalization stats not found at {stats_path}.")
|
| 20 |
-
log.info(f"Loading action normalization stats from {stats_path}")
|
| 21 |
-
with path.open("r") as f:
|
| 22 |
-
raw = json.load(f)
|
| 23 |
-
if stats_key in raw:
|
| 24 |
-
raw = raw[stats_key]
|
| 25 |
-
if not isinstance(raw, dict):
|
| 26 |
-
raise TypeError(f"Action normalization stats block {stats_key!r} in {stats_path} must be a dict.")
|
| 27 |
-
elif stats_key != "global":
|
| 28 |
-
raise KeyError(f"Action normalization stats block {stats_key!r} not found in {stats_path}.")
|
| 29 |
-
stat_keys = {"mean", "std", "min", "max", "q01", "q99"}
|
| 30 |
-
return {k: np.array(v, dtype=np.float32) for k, v in raw.items() if k in stat_keys}
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
def normalize_action(
|
| 34 |
-
action: torch.Tensor,
|
| 35 |
-
method: str,
|
| 36 |
-
stats: dict[str, torch.Tensor],
|
| 37 |
-
) -> torch.Tensor:
|
| 38 |
-
"""Normalize action tensor (all dimensions including gripper)."""
|
| 39 |
-
if method == "quantile":
|
| 40 |
-
q01, q99 = stats["q01"], stats["q99"]
|
| 41 |
-
denom = (q99 - q01).clamp(min=1e-8)
|
| 42 |
-
return (2.0 * (action - q01) / denom - 1.0).clamp(-1.0, 1.0)
|
| 43 |
-
if method == "meanstd":
|
| 44 |
-
return (action - stats["mean"]) / stats["std"].clamp(min=1e-8)
|
| 45 |
-
if method == "minmax":
|
| 46 |
-
lo, hi = stats["min"], stats["max"]
|
| 47 |
-
denom = (hi - lo).clamp(min=1e-8)
|
| 48 |
-
return (2.0 * (action - lo) / denom - 1.0).clamp(-1.0, 1.0)
|
| 49 |
-
raise ValueError(f"Unknown normalization method: {method!r}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cosmos-framework/cosmos_framework/data/vfm/action/bridge_orig_lerobot_dataset.py
CHANGED
|
@@ -18,7 +18,6 @@ from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata
|
|
| 18 |
|
| 19 |
from cosmos_framework.utils import log
|
| 20 |
from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
|
| 21 |
-
ActionNormalization,
|
| 22 |
ActionSpec,
|
| 23 |
BaseActionLeRobotDataset,
|
| 24 |
Gripper,
|
|
@@ -101,7 +100,6 @@ class BridgeOrigLeRobotDataset(BaseActionLeRobotDataset):
|
|
| 101 |
split: str = "train",
|
| 102 |
mode: str = "policy",
|
| 103 |
pose_convention: PoseConvention = "backward_framewise",
|
| 104 |
-
action_normalization: ActionNormalization | None = None,
|
| 105 |
viewpoint: Viewpoint = "ego_view",
|
| 106 |
enable_fast_init: bool = False,
|
| 107 |
) -> None:
|
|
@@ -117,7 +115,6 @@ class BridgeOrigLeRobotDataset(BaseActionLeRobotDataset):
|
|
| 117 |
viewpoint=viewpoint,
|
| 118 |
pose_convention=pose_convention,
|
| 119 |
rotation_format="rot6d",
|
| 120 |
-
action_normalization=action_normalization,
|
| 121 |
tolerance_s=1e-4,
|
| 122 |
enable_fast_init=enable_fast_init,
|
| 123 |
)
|
|
@@ -208,7 +205,6 @@ class BridgeOrigLeRobotDataset(BaseActionLeRobotDataset):
|
|
| 208 |
# ------------------------------------------------------------------
|
| 209 |
# Normalization is handled by BaseActionLeRobotDataset.
|
| 210 |
# Stats are loaded from:
|
| 211 |
-
# cosmos_framework/data/vfm/action/normalizers/
|
| 212 |
# bridge_orig_lerobot_<pose_convention>_<rotation_format>.json
|
| 213 |
# Regenerate via ``compute_action_stats.py`` + ``debug/stats_all.sh``.
|
| 214 |
# ------------------------------------------------------------------
|
|
|
|
| 18 |
|
| 19 |
from cosmos_framework.utils import log
|
| 20 |
from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
|
|
|
|
| 21 |
ActionSpec,
|
| 22 |
BaseActionLeRobotDataset,
|
| 23 |
Gripper,
|
|
|
|
| 100 |
split: str = "train",
|
| 101 |
mode: str = "policy",
|
| 102 |
pose_convention: PoseConvention = "backward_framewise",
|
|
|
|
| 103 |
viewpoint: Viewpoint = "ego_view",
|
| 104 |
enable_fast_init: bool = False,
|
| 105 |
) -> None:
|
|
|
|
| 115 |
viewpoint=viewpoint,
|
| 116 |
pose_convention=pose_convention,
|
| 117 |
rotation_format="rot6d",
|
|
|
|
| 118 |
tolerance_s=1e-4,
|
| 119 |
enable_fast_init=enable_fast_init,
|
| 120 |
)
|
|
|
|
| 205 |
# ------------------------------------------------------------------
|
| 206 |
# Normalization is handled by BaseActionLeRobotDataset.
|
| 207 |
# Stats are loaded from:
|
|
|
|
| 208 |
# bridge_orig_lerobot_<pose_convention>_<rotation_format>.json
|
| 209 |
# Regenerate via ``compute_action_stats.py`` + ``debug/stats_all.sh``.
|
| 210 |
# ------------------------------------------------------------------
|
cosmos-framework/cosmos_framework/data/vfm/action/cosmos3_action_lerobot.py
CHANGED
|
@@ -21,9 +21,8 @@ from bisect import bisect_right
|
|
| 21 |
from collections import OrderedDict, defaultdict
|
| 22 |
from collections.abc import Callable, Sequence
|
| 23 |
from concurrent.futures import ThreadPoolExecutor
|
| 24 |
-
from pathlib import Path
|
| 25 |
from threading import Lock
|
| 26 |
-
from typing import Any,
|
| 27 |
|
| 28 |
import huggingface_hub.constants as _hf_const
|
| 29 |
import numpy as np
|
|
@@ -57,11 +56,6 @@ def _ensure_hf_hub_offline() -> None:
|
|
| 57 |
from functools import cached_property
|
| 58 |
|
| 59 |
from cosmos_framework.utils import log
|
| 60 |
-
from cosmos_framework.data.vfm.action.action_normalization import (
|
| 61 |
-
load_action_stats,
|
| 62 |
-
normalize_action,
|
| 63 |
-
)
|
| 64 |
-
|
| 65 |
# Re-export the action_spec DSL from this module so that subclass datasets
|
| 66 |
# only need a single import block (alongside ``BaseActionLeRobotDataset``).
|
| 67 |
from cosmos_framework.data.vfm.action.action_spec import ( # noqa: F401 (re-export)
|
|
@@ -96,8 +90,6 @@ from cosmos_framework.data.vfm.action_scripts.memprofile import (
|
|
| 96 |
# ---------------------------------------------------------------------------
|
| 97 |
_LRU_VIDEO_CACHE_MAX_SIZE: int = 64
|
| 98 |
_LRU_DATASET_MAX_LOADED: int = 32
|
| 99 |
-
ActionNormalization = Literal["quantile", "quantile_rot", "meanstd", "minmax"]
|
| 100 |
-
_ACTION_NORMALIZATION_CHOICES: tuple[str, ...] = ("quantile", "quantile_rot", "meanstd", "minmax")
|
| 101 |
|
| 102 |
_decoder_cache_patched = False
|
| 103 |
|
|
@@ -290,14 +282,6 @@ class BaseActionLeRobotDataset(Dataset):
|
|
| 290 |
# Applied as: R_opencv = R_native @ _to_opencv
|
| 291 |
# Subclasses override in __init__; default is identity (no correction).
|
| 292 |
|
| 293 |
-
# Bundled normalization stats directory. Stats are committed at
|
| 294 |
-
# ``<_NORMALIZERS_DIR>/<embodiment>_<pose>_<rotation_format>.json`` (flat
|
| 295 |
-
# layout matching the existing UMI files) and produced by
|
| 296 |
-
# ``projects/cosmos3/vfm/datasets/action/compute_action_stats.py``.
|
| 297 |
-
# Subclasses that need a different filename scheme can override
|
| 298 |
-
# :meth:`_normalizer_filename`.
|
| 299 |
-
_NORMALIZERS_DIR: ClassVar[Path] = Path(__file__).parent / "normalizers"
|
| 300 |
-
|
| 301 |
def __init__(
|
| 302 |
self,
|
| 303 |
*,
|
|
@@ -311,7 +295,6 @@ class BaseActionLeRobotDataset(Dataset):
|
|
| 311 |
viewpoint: Viewpoint,
|
| 312 |
pose_convention: str | None = None,
|
| 313 |
rotation_format: str | None = None,
|
| 314 |
-
action_normalization: ActionNormalization | None = None,
|
| 315 |
tolerance_s: float = 1e-4,
|
| 316 |
max_loaded_datasets: int = _LRU_DATASET_MAX_LOADED,
|
| 317 |
skip_video_loading: bool = False,
|
|
@@ -326,10 +309,6 @@ class BaseActionLeRobotDataset(Dataset):
|
|
| 326 |
|
| 327 |
assert sample_stride >= 1, f"sample_stride must be >= 1, got {sample_stride}"
|
| 328 |
assert fast_init_max_workers >= 1, f"fast_init_max_workers must be >= 1, got {fast_init_max_workers}"
|
| 329 |
-
assert action_normalization is None or action_normalization in _ACTION_NORMALIZATION_CHOICES, (
|
| 330 |
-
f"action_normalization must be None or one of {_ACTION_NORMALIZATION_CHOICES}, got {action_normalization!r}"
|
| 331 |
-
)
|
| 332 |
-
|
| 333 |
with rss_tracker(f"{self.__class__.__name__}.__init__", enabled=self._memprofile):
|
| 334 |
self._fps = fps
|
| 335 |
self._dt = 1.0 / fps
|
|
@@ -342,10 +321,6 @@ class BaseActionLeRobotDataset(Dataset):
|
|
| 342 |
self._viewpoint: Viewpoint = viewpoint
|
| 343 |
self._pose_convention = pose_convention
|
| 344 |
self._rotation_format = rotation_format
|
| 345 |
-
self._action_normalization = action_normalization
|
| 346 |
-
# Lazy-loaded stats cache, populated on first call to
|
| 347 |
-
# :meth:`_normalize_action`. Per-process (workers get their own).
|
| 348 |
-
self._norm_stats: dict[str, torch.Tensor] | None = None
|
| 349 |
self._tolerance_s = tolerance_s
|
| 350 |
self._max_loaded_datasets = max_loaded_datasets
|
| 351 |
self._skip_video_loading = skip_video_loading
|
|
@@ -728,74 +703,6 @@ class BaseActionLeRobotDataset(Dataset):
|
|
| 728 |
|
| 729 |
return mode, dataset_idx, row_idx, sample
|
| 730 |
|
| 731 |
-
# -- action normalization ------------------------------------------------
|
| 732 |
-
|
| 733 |
-
def _normalizer_filename(self) -> str:
|
| 734 |
-
"""Bundled stats filename for this dataset instance.
|
| 735 |
-
|
| 736 |
-
Default convention (matches ``compute_action_stats.py`` output):
|
| 737 |
-
``<embodiment_type>[_<pose_convention>][_<rotation_format>].json``.
|
| 738 |
-
|
| 739 |
-
Pose/rotation suffixes are appended only when the instance actually
|
| 740 |
-
has them (SE(3) pose datasets like Bridge / DROID). Joint-space
|
| 741 |
-
datasets — where both are ``None`` — resolve to just
|
| 742 |
-
``<embodiment_type>.json``.
|
| 743 |
-
|
| 744 |
-
Subclasses may override when the bundled filename uses a different
|
| 745 |
-
scheme (e.g. UMI's ``uva_umi_single_task_normalizer.json``).
|
| 746 |
-
"""
|
| 747 |
-
if not self._embodiment_type:
|
| 748 |
-
raise RuntimeError(
|
| 749 |
-
f"{self.__class__.__name__}: embodiment_type is not set; cannot resolve normalizer filename."
|
| 750 |
-
)
|
| 751 |
-
parts = [self._embodiment_type]
|
| 752 |
-
if self._pose_convention:
|
| 753 |
-
parts.append(self._pose_convention)
|
| 754 |
-
if self._rotation_format:
|
| 755 |
-
parts.append(self._rotation_format)
|
| 756 |
-
return "_".join(parts) + ".json"
|
| 757 |
-
|
| 758 |
-
def _normalizer_path(self) -> Path:
|
| 759 |
-
"""Full path to the bundled stats JSON for this dataset."""
|
| 760 |
-
return self._NORMALIZERS_DIR / self._normalizer_filename()
|
| 761 |
-
|
| 762 |
-
def _load_norm_stats(self) -> dict[str, torch.Tensor]:
|
| 763 |
-
"""Lazy-load action normalization stats (once per worker process).
|
| 764 |
-
|
| 765 |
-
Raises :class:`FileNotFoundError` if the stats file is missing. This
|
| 766 |
-
is intentional — silently falling back to identity normalization when
|
| 767 |
-
the user asked for ``quantile`` / ``quantile_rot`` / ``meanstd`` /
|
| 768 |
-
``minmax`` would be a training bug.
|
| 769 |
-
"""
|
| 770 |
-
if self._norm_stats is not None:
|
| 771 |
-
return self._norm_stats
|
| 772 |
-
stats_key = "global_raw" if self._action_normalization == "quantile_rot" else "global"
|
| 773 |
-
raw = load_action_stats(str(self._normalizer_path()), stats_key=stats_key)
|
| 774 |
-
self._norm_stats = {}
|
| 775 |
-
for key, value in raw.items():
|
| 776 |
-
self._norm_stats[key] = torch.from_numpy(value).float() # [D]
|
| 777 |
-
return self._norm_stats
|
| 778 |
-
|
| 779 |
-
def _normalize_action(self, action: torch.Tensor) -> torch.Tensor:
|
| 780 |
-
"""Apply the configured normalization, or return the raw action.
|
| 781 |
-
|
| 782 |
-
- ``action_normalization=None`` → pass-through (used by viewer / debug)
|
| 783 |
-
- ``"quantile"`` → ``2·(x − q01) / (q99 − q01) − 1`` clamped to [-1, 1]
|
| 784 |
-
- ``"quantile_rot"`` → same as ``"quantile"``, but using ``global_raw``
|
| 785 |
-
stats so rotation dimensions are normalized too.
|
| 786 |
-
- ``"meanstd"`` → ``(x − mean) / std``
|
| 787 |
-
- ``"minmax"`` → ``2·(x − min) / (max − min) − 1`` clamped to [-1, 1]
|
| 788 |
-
"""
|
| 789 |
-
if self._action_normalization is None:
|
| 790 |
-
return action
|
| 791 |
-
method = "quantile" if self._action_normalization == "quantile_rot" else self._action_normalization
|
| 792 |
-
normalized_action = normalize_action(
|
| 793 |
-
action,
|
| 794 |
-
method,
|
| 795 |
-
self._load_norm_stats(),
|
| 796 |
-
) # [T,D]
|
| 797 |
-
return normalized_action
|
| 798 |
-
|
| 799 |
# -- video formatting ----------------------------------------------------
|
| 800 |
|
| 801 |
def _convert_video(self, video_tchw: torch.Tensor | None) -> torch.Tensor | None:
|
|
@@ -989,9 +896,9 @@ class BaseActionLeRobotDataset(Dataset):
|
|
| 989 |
if idle_frames is not None:
|
| 990 |
extras = {"idle_frames": idle_frames, **extras}
|
| 991 |
|
| 992 |
-
|
| 993 |
if self._skip_video_loading:
|
| 994 |
-
result: dict[str, Any] = {"action":
|
| 995 |
if "idle_frames" in extras:
|
| 996 |
result["idle_frames"] = extras["idle_frames"]
|
| 997 |
return result
|
|
@@ -999,7 +906,7 @@ class BaseActionLeRobotDataset(Dataset):
|
|
| 999 |
return {
|
| 1000 |
"ai_caption": ai_caption,
|
| 1001 |
"video": formatted_video,
|
| 1002 |
-
"action":
|
| 1003 |
"conditioning_fps": torch.tensor(self._fps, dtype=torch.long),
|
| 1004 |
"mode": mode,
|
| 1005 |
"domain_id": torch.tensor(self._domain_id, dtype=torch.long),
|
|
|
|
| 21 |
from collections import OrderedDict, defaultdict
|
| 22 |
from collections.abc import Callable, Sequence
|
| 23 |
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
| 24 |
from threading import Lock
|
| 25 |
+
from typing import Any, Literal
|
| 26 |
|
| 27 |
import huggingface_hub.constants as _hf_const
|
| 28 |
import numpy as np
|
|
|
|
| 56 |
from functools import cached_property
|
| 57 |
|
| 58 |
from cosmos_framework.utils import log
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
# Re-export the action_spec DSL from this module so that subclass datasets
|
| 60 |
# only need a single import block (alongside ``BaseActionLeRobotDataset``).
|
| 61 |
from cosmos_framework.data.vfm.action.action_spec import ( # noqa: F401 (re-export)
|
|
|
|
| 90 |
# ---------------------------------------------------------------------------
|
| 91 |
_LRU_VIDEO_CACHE_MAX_SIZE: int = 64
|
| 92 |
_LRU_DATASET_MAX_LOADED: int = 32
|
|
|
|
|
|
|
| 93 |
|
| 94 |
_decoder_cache_patched = False
|
| 95 |
|
|
|
|
| 282 |
# Applied as: R_opencv = R_native @ _to_opencv
|
| 283 |
# Subclasses override in __init__; default is identity (no correction).
|
| 284 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
def __init__(
|
| 286 |
self,
|
| 287 |
*,
|
|
|
|
| 295 |
viewpoint: Viewpoint,
|
| 296 |
pose_convention: str | None = None,
|
| 297 |
rotation_format: str | None = None,
|
|
|
|
| 298 |
tolerance_s: float = 1e-4,
|
| 299 |
max_loaded_datasets: int = _LRU_DATASET_MAX_LOADED,
|
| 300 |
skip_video_loading: bool = False,
|
|
|
|
| 309 |
|
| 310 |
assert sample_stride >= 1, f"sample_stride must be >= 1, got {sample_stride}"
|
| 311 |
assert fast_init_max_workers >= 1, f"fast_init_max_workers must be >= 1, got {fast_init_max_workers}"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
with rss_tracker(f"{self.__class__.__name__}.__init__", enabled=self._memprofile):
|
| 313 |
self._fps = fps
|
| 314 |
self._dt = 1.0 / fps
|
|
|
|
| 321 |
self._viewpoint: Viewpoint = viewpoint
|
| 322 |
self._pose_convention = pose_convention
|
| 323 |
self._rotation_format = rotation_format
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
self._tolerance_s = tolerance_s
|
| 325 |
self._max_loaded_datasets = max_loaded_datasets
|
| 326 |
self._skip_video_loading = skip_video_loading
|
|
|
|
| 703 |
|
| 704 |
return mode, dataset_idx, row_idx, sample
|
| 705 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 706 |
# -- video formatting ----------------------------------------------------
|
| 707 |
|
| 708 |
def _convert_video(self, video_tchw: torch.Tensor | None) -> torch.Tensor | None:
|
|
|
|
| 896 |
if idle_frames is not None:
|
| 897 |
extras = {"idle_frames": idle_frames, **extras}
|
| 898 |
|
| 899 |
+
raw_action = action # [T,D]
|
| 900 |
if self._skip_video_loading:
|
| 901 |
+
result: dict[str, Any] = {"action": raw_action}
|
| 902 |
if "idle_frames" in extras:
|
| 903 |
result["idle_frames"] = extras["idle_frames"]
|
| 904 |
return result
|
|
|
|
| 906 |
return {
|
| 907 |
"ai_caption": ai_caption,
|
| 908 |
"video": formatted_video,
|
| 909 |
+
"action": raw_action,
|
| 910 |
"conditioning_fps": torch.tensor(self._fps, dtype=torch.long),
|
| 911 |
"mode": mode,
|
| 912 |
"domain_id": torch.tensor(self._domain_id, dtype=torch.long),
|
cosmos-framework/cosmos_framework/data/vfm/action/domain_utils.py
CHANGED
|
@@ -7,16 +7,13 @@ EMBODIMENT_TO_DOMAIN_ID: dict[str, int] = {
|
|
| 7 |
"no_action": 0,
|
| 8 |
"av": 1,
|
| 9 |
"camera_pose": 2,
|
| 10 |
-
"hand_pose": 3,
|
| 11 |
"pusht": 4,
|
| 12 |
-
"libero": 5,
|
| 13 |
"umi": 6,
|
| 14 |
"bridge_orig_lerobot": 7,
|
| 15 |
"droid_lerobot": 8,
|
| 16 |
"robomind-franka": 8, # Both Droid and RoboMIND-Franka are using robotiq and franka
|
| 17 |
"embodiment_b": 9,
|
| 18 |
"robomind-franka-dual": 12,
|
| 19 |
-
"robomind-ur": 13,
|
| 20 |
"fractal": 20,
|
| 21 |
}
|
| 22 |
|
|
|
|
| 7 |
"no_action": 0,
|
| 8 |
"av": 1,
|
| 9 |
"camera_pose": 2,
|
|
|
|
| 10 |
"pusht": 4,
|
|
|
|
| 11 |
"umi": 6,
|
| 12 |
"bridge_orig_lerobot": 7,
|
| 13 |
"droid_lerobot": 8,
|
| 14 |
"robomind-franka": 8, # Both Droid and RoboMIND-Franka are using robotiq and franka
|
| 15 |
"embodiment_b": 9,
|
| 16 |
"robomind-franka-dual": 12,
|
|
|
|
| 17 |
"fractal": 20,
|
| 18 |
}
|
| 19 |
|
cosmos-framework/cosmos_framework/data/vfm/action/droid_lerobot_dataset.py
CHANGED
|
@@ -11,7 +11,6 @@ from scipy.spatial.transform import Rotation as R
|
|
| 11 |
|
| 12 |
from cosmos_framework.utils import log
|
| 13 |
from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
|
| 14 |
-
ActionNormalization,
|
| 15 |
ActionSpec,
|
| 16 |
BaseActionLeRobotDataset,
|
| 17 |
Gripper,
|
|
@@ -69,7 +68,6 @@ class DROIDLeRobotDataset(BaseActionLeRobotDataset):
|
|
| 69 |
split: str = "train",
|
| 70 |
mode: str = "policy",
|
| 71 |
pose_convention: PoseConvention = "backward_framewise",
|
| 72 |
-
action_normalization: ActionNormalization | None = None,
|
| 73 |
tolerance_s=2e-4,
|
| 74 |
viewpoint: Viewpoint = "concat_view",
|
| 75 |
use_success_only: bool = False,
|
|
@@ -93,7 +91,6 @@ class DROIDLeRobotDataset(BaseActionLeRobotDataset):
|
|
| 93 |
viewpoint=viewpoint,
|
| 94 |
pose_convention=pose_convention,
|
| 95 |
rotation_format="rot6d",
|
| 96 |
-
action_normalization=action_normalization,
|
| 97 |
tolerance_s=tolerance_s,
|
| 98 |
enable_fast_init=enable_fast_init,
|
| 99 |
)
|
|
@@ -398,7 +395,7 @@ class DROIDLeRobotDataset(BaseActionLeRobotDataset):
|
|
| 398 |
axis=-1,
|
| 399 |
)
|
| 400 |
).float()
|
| 401 |
-
extras["history_action"] =
|
| 402 |
if self._use_state:
|
| 403 |
initial_gripper = sample[_GRIPPER_STATE_FEATURE][0].unsqueeze(-1)
|
| 404 |
if self._is_gripper_action_flipped:
|
|
@@ -449,7 +446,7 @@ class DROIDLeRobotDataset(BaseActionLeRobotDataset):
|
|
| 449 |
if self._is_gripper_action_flipped:
|
| 450 |
hist_gripper = 1.0 - hist_gripper
|
| 451 |
hist_action_raw = torch.cat((hist_joint, hist_gripper), dim=-1).float()
|
| 452 |
-
extras["history_action"] =
|
| 453 |
if self._use_state:
|
| 454 |
initial_gripper = sample[_GRIPPER_STATE_FEATURE][-self._chunk_length - 1].unsqueeze(-1)
|
| 455 |
if self._is_gripper_action_flipped:
|
|
|
|
| 11 |
|
| 12 |
from cosmos_framework.utils import log
|
| 13 |
from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
|
|
|
|
| 14 |
ActionSpec,
|
| 15 |
BaseActionLeRobotDataset,
|
| 16 |
Gripper,
|
|
|
|
| 68 |
split: str = "train",
|
| 69 |
mode: str = "policy",
|
| 70 |
pose_convention: PoseConvention = "backward_framewise",
|
|
|
|
| 71 |
tolerance_s=2e-4,
|
| 72 |
viewpoint: Viewpoint = "concat_view",
|
| 73 |
use_success_only: bool = False,
|
|
|
|
| 91 |
viewpoint=viewpoint,
|
| 92 |
pose_convention=pose_convention,
|
| 93 |
rotation_format="rot6d",
|
|
|
|
| 94 |
tolerance_s=tolerance_s,
|
| 95 |
enable_fast_init=enable_fast_init,
|
| 96 |
)
|
|
|
|
| 395 |
axis=-1,
|
| 396 |
)
|
| 397 |
).float()
|
| 398 |
+
extras["history_action"] = hist_action_raw
|
| 399 |
if self._use_state:
|
| 400 |
initial_gripper = sample[_GRIPPER_STATE_FEATURE][0].unsqueeze(-1)
|
| 401 |
if self._is_gripper_action_flipped:
|
|
|
|
| 446 |
if self._is_gripper_action_flipped:
|
| 447 |
hist_gripper = 1.0 - hist_gripper
|
| 448 |
hist_action_raw = torch.cat((hist_joint, hist_gripper), dim=-1).float()
|
| 449 |
+
extras["history_action"] = hist_action_raw
|
| 450 |
if self._use_state:
|
| 451 |
initial_gripper = sample[_GRIPPER_STATE_FEATURE][-self._chunk_length - 1].unsqueeze(-1)
|
| 452 |
if self._is_gripper_action_flipped:
|
cosmos-framework/cosmos_framework/data/vfm/action/fractal.py
CHANGED
|
@@ -15,7 +15,6 @@ from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata
|
|
| 15 |
|
| 16 |
from cosmos_framework.utils import log
|
| 17 |
from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
|
| 18 |
-
ActionNormalization,
|
| 19 |
ActionSpec,
|
| 20 |
BaseActionLeRobotDataset,
|
| 21 |
Gripper,
|
|
@@ -84,7 +83,6 @@ class FractalLeRobotDataset(BaseActionLeRobotDataset):
|
|
| 84 |
split: str = "train",
|
| 85 |
mode: str = "policy",
|
| 86 |
pose_convention: PoseConvention = "backward_framewise",
|
| 87 |
-
action_normalization: ActionNormalization | None = None,
|
| 88 |
viewpoint: Viewpoint = "ego_view",
|
| 89 |
enable_fast_init: bool = False,
|
| 90 |
) -> None:
|
|
@@ -103,7 +101,6 @@ class FractalLeRobotDataset(BaseActionLeRobotDataset):
|
|
| 103 |
actions. Supports ``"backward_framewise"`` and
|
| 104 |
``"backward_anchored"``. Set to ``None`` to disable action
|
| 105 |
construction outside image-to-video mode.
|
| 106 |
-
action_normalization: Optional bundled-stats normalization
|
| 107 |
(``"quantile"`` / ``"quantile_rot"`` / ``"meanstd"`` / ``"minmax"``);
|
| 108 |
``None`` returns raw actions.
|
| 109 |
viewpoint: Camera viewpoint type for this dataset.
|
|
@@ -119,7 +116,6 @@ class FractalLeRobotDataset(BaseActionLeRobotDataset):
|
|
| 119 |
viewpoint=viewpoint,
|
| 120 |
pose_convention=pose_convention,
|
| 121 |
rotation_format="rot6d",
|
| 122 |
-
action_normalization=action_normalization,
|
| 123 |
tolerance_s=1e-4,
|
| 124 |
enable_fast_init=enable_fast_init,
|
| 125 |
)
|
|
|
|
| 15 |
|
| 16 |
from cosmos_framework.utils import log
|
| 17 |
from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
|
|
|
|
| 18 |
ActionSpec,
|
| 19 |
BaseActionLeRobotDataset,
|
| 20 |
Gripper,
|
|
|
|
| 83 |
split: str = "train",
|
| 84 |
mode: str = "policy",
|
| 85 |
pose_convention: PoseConvention = "backward_framewise",
|
|
|
|
| 86 |
viewpoint: Viewpoint = "ego_view",
|
| 87 |
enable_fast_init: bool = False,
|
| 88 |
) -> None:
|
|
|
|
| 101 |
actions. Supports ``"backward_framewise"`` and
|
| 102 |
``"backward_anchored"``. Set to ``None`` to disable action
|
| 103 |
construction outside image-to-video mode.
|
|
|
|
| 104 |
(``"quantile"`` / ``"quantile_rot"`` / ``"meanstd"`` / ``"minmax"``);
|
| 105 |
``None`` returns raw actions.
|
| 106 |
viewpoint: Camera viewpoint type for this dataset.
|
|
|
|
| 116 |
viewpoint=viewpoint,
|
| 117 |
pose_convention=pose_convention,
|
| 118 |
rotation_format="rot6d",
|
|
|
|
| 119 |
tolerance_s=1e-4,
|
| 120 |
enable_fast_init=enable_fast_init,
|
| 121 |
)
|
cosmos-framework/cosmos_framework/data/vfm/action/libero_dataset.py
DELETED
|
@@ -1,611 +0,0 @@
|
|
| 1 |
-
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
-
# SPDX-License-Identifier: OpenMDW-1.1
|
| 3 |
-
|
| 4 |
-
"""LIBERO dataset for training from local storage, supporting multiple dataset roots."""
|
| 5 |
-
|
| 6 |
-
import random
|
| 7 |
-
from pathlib import Path
|
| 8 |
-
from typing import Literal
|
| 9 |
-
|
| 10 |
-
import torch
|
| 11 |
-
import torchvision.transforms.functional as F
|
| 12 |
-
from lerobot.datasets.lerobot_dataset import LeRobotDataset
|
| 13 |
-
from torch.utils.data import Dataset
|
| 14 |
-
|
| 15 |
-
from cosmos_framework.utils import log
|
| 16 |
-
from cosmos_framework.data.vfm.action.action_normalization import (
|
| 17 |
-
load_action_stats,
|
| 18 |
-
normalize_action,
|
| 19 |
-
)
|
| 20 |
-
from cosmos_framework.data.vfm.action.action_spec import (
|
| 21 |
-
Gripper,
|
| 22 |
-
Pos,
|
| 23 |
-
Rot,
|
| 24 |
-
build_action_spec,
|
| 25 |
-
)
|
| 26 |
-
from cosmos_framework.data.vfm.action.domain_utils import get_domain_id
|
| 27 |
-
from cosmos_framework.data.vfm.action.libero_pose_utils import (
|
| 28 |
-
libero_action_dim,
|
| 29 |
-
libero_rotation_format,
|
| 30 |
-
)
|
| 31 |
-
from cosmos_framework.data.vfm.action.pose_utils import (
|
| 32 |
-
compute_idle_frames,
|
| 33 |
-
convert_rotation,
|
| 34 |
-
)
|
| 35 |
-
|
| 36 |
-
LIBERO_ROOTS: list[str] = [
|
| 37 |
-
"<PATH_TO_LIBERO_10>",
|
| 38 |
-
"<PATH_TO_LIBERO_90>",
|
| 39 |
-
"<PATH_TO_LIBERO_OBJECT>",
|
| 40 |
-
"<PATH_TO_LIBERO_SPATIAL>",
|
| 41 |
-
"<PATH_TO_LIBERO_GOAL>",
|
| 42 |
-
]
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
class LIBERODataset(Dataset):
|
| 46 |
-
"""
|
| 47 |
-
A Dataset wrapper for LeRobot LIBERO dataset(s) designed for training from local storage.
|
| 48 |
-
|
| 49 |
-
This dataset:
|
| 50 |
-
- Loads data from local storage using LeRobotDataset
|
| 51 |
-
- Supports multiple dataset roots that are concatenated into one dataset
|
| 52 |
-
- Supports configurable camera modes (image, wrist_image, or concat_view)
|
| 53 |
-
- Filters episodes for train/val split
|
| 54 |
-
- Filters frames at episode boundaries (to avoid padding issues with delta timestamps)
|
| 55 |
-
- Uses task descriptions from meta/tasks.parquet for ai_caption
|
| 56 |
-
"""
|
| 57 |
-
|
| 58 |
-
_NORMALIZERS_DIR = Path(__file__).parent / "normalizers"
|
| 59 |
-
|
| 60 |
-
def __init__(
|
| 61 |
-
self,
|
| 62 |
-
repo_id: str | list[str] = "lerobot/libero_90",
|
| 63 |
-
root: str | list[str] | None = LIBERO_ROOTS,
|
| 64 |
-
image_size: int = 256,
|
| 65 |
-
chunk_length: int = 16, # must be divisible by 4
|
| 66 |
-
fps: int = 10, # IMPORTANT! LIBERO is at 20fps. If using frame_wise_relative in policy mode, we have to match the fps.
|
| 67 |
-
mode: str = "policy",
|
| 68 |
-
video_backend: str | None = "torchcodec",
|
| 69 |
-
download_videos: bool = False,
|
| 70 |
-
force_cache_sync: bool = False,
|
| 71 |
-
tolerance_s: float = 1e-4,
|
| 72 |
-
split: str = "train",
|
| 73 |
-
val_ratio: float = 0.01,
|
| 74 |
-
seed: int = 0,
|
| 75 |
-
# Camera configuration
|
| 76 |
-
camera_mode: str = "image", # 'image', 'wrist_image', or 'concat_view'
|
| 77 |
-
# Action configuration
|
| 78 |
-
action_space: str = "frame_wise_relative", # "absolute" or "relative" or "frame_wise_relative"
|
| 79 |
-
# rotation_space
|
| 80 |
-
rotation_space: Literal["9d", "6d", "3d"] = "3d",
|
| 81 |
-
# Native simulator frame or shared OpenCV-style EE frame used by midtraining.
|
| 82 |
-
pose_coordinate_frame: Literal["native", "opencv"] = "native",
|
| 83 |
-
# domain-aware configuration
|
| 84 |
-
embodiment_type: str = "libero",
|
| 85 |
-
action_normalization: Literal["quantile", "quantile_rot", "meanstd", "minmax"] | None = None,
|
| 86 |
-
action_stats_path: str | None = None,
|
| 87 |
-
skip_video_loading: bool = False,
|
| 88 |
-
):
|
| 89 |
-
super().__init__()
|
| 90 |
-
self._embodiment_type = embodiment_type
|
| 91 |
-
self.domain_id = get_domain_id(embodiment_type)
|
| 92 |
-
self.image_size = image_size
|
| 93 |
-
self.chunk_length = chunk_length
|
| 94 |
-
assert self.chunk_length % 4 == 0, "chunk_length must be divisible by 4"
|
| 95 |
-
self.fps = fps
|
| 96 |
-
self.mode = mode
|
| 97 |
-
self.split = split.lower().strip()
|
| 98 |
-
self.val_ratio = val_ratio
|
| 99 |
-
self.seed = seed
|
| 100 |
-
self.camera_mode = camera_mode.lower().strip()
|
| 101 |
-
self.action_space = action_space
|
| 102 |
-
self.action_normalization = action_normalization
|
| 103 |
-
self.rotation_space = rotation_space.lower().strip()
|
| 104 |
-
self.pose_coordinate_frame = pose_coordinate_frame
|
| 105 |
-
self._pose_convention = self.action_space
|
| 106 |
-
self._rotation_format = libero_rotation_format(self.rotation_space)
|
| 107 |
-
# When True, skip video decoding entirely: drop image keys from
|
| 108 |
-
# delta_timestamps so LeRobot never touches the mp4, and return
|
| 109 |
-
# ``video=None`` in __getitem__. Must be set at construction time
|
| 110 |
-
# because LeRobotDataset is eagerly built in __init__.
|
| 111 |
-
self._skip_video_loading = bool(skip_video_loading)
|
| 112 |
-
|
| 113 |
-
# Load action normalization stats. ``action_min`` / ``action_range`` are
|
| 114 |
-
# retained for older LIBERO eval code that knows how to invert a
|
| 115 |
-
# range-style [-1, 1] normalization.
|
| 116 |
-
self._norm_stats: dict[str, torch.Tensor] | None = None
|
| 117 |
-
self.action_min: torch.Tensor | None = None
|
| 118 |
-
self.action_max: torch.Tensor | None = None
|
| 119 |
-
self.action_range: torch.Tensor | None = None
|
| 120 |
-
if self.action_normalization is not None:
|
| 121 |
-
stats_path = self._resolve_action_stats_path(action_stats_path)
|
| 122 |
-
stats_key = "global_raw" if self.action_normalization == "quantile_rot" else "global"
|
| 123 |
-
raw_stats = load_action_stats(str(stats_path), stats_key=stats_key)
|
| 124 |
-
self._norm_stats = {}
|
| 125 |
-
for key, value in raw_stats.items():
|
| 126 |
-
self._norm_stats[key] = torch.from_numpy(value).float() # [D]
|
| 127 |
-
self._set_range_denormalization_stats()
|
| 128 |
-
log.info(
|
| 129 |
-
f"Loaded LIBERO action stats from {stats_path} with action_normalization={self.action_normalization}"
|
| 130 |
-
)
|
| 131 |
-
|
| 132 |
-
# Validate camera mode
|
| 133 |
-
if self.camera_mode not in {"image", "wrist_image", "concat_view"}:
|
| 134 |
-
raise ValueError(f"Unsupported camera_mode={camera_mode!r}. Use 'image', 'wrist_image', or 'concat_view'.")
|
| 135 |
-
|
| 136 |
-
# Validate split
|
| 137 |
-
if self.split not in {"train", "val", "valid", "validation", "eval", "test", "full"}:
|
| 138 |
-
raise ValueError(f"Unsupported {split=}. Use train/val/full.")
|
| 139 |
-
|
| 140 |
-
# Build delta timestamps based on camera mode
|
| 141 |
-
dt = 1.0 / self.fps
|
| 142 |
-
|
| 143 |
-
if self.fps != 20:
|
| 144 |
-
log.warning(
|
| 145 |
-
f"LIBERO is at 20fps. If using frame_wise_relative for policy mode training, we have to match the fps. fps={self.fps}"
|
| 146 |
-
)
|
| 147 |
-
|
| 148 |
-
# Determine which image keys to use
|
| 149 |
-
if self.camera_mode == "image":
|
| 150 |
-
self.image_keys = ["observation.images.image"]
|
| 151 |
-
elif self.camera_mode == "wrist_image":
|
| 152 |
-
self.image_keys = ["observation.images.wrist_image"]
|
| 153 |
-
else: # concat_view
|
| 154 |
-
self.image_keys = ["observation.images.image", "observation.images.wrist_image"]
|
| 155 |
-
|
| 156 |
-
# Build delta_timestamps for all keys (same convention as PushT: 0 to chunk_length)
|
| 157 |
-
self.delta_timestamps: dict[str, list[float]] = {}
|
| 158 |
-
if not self._skip_video_loading:
|
| 159 |
-
for key in self.image_keys:
|
| 160 |
-
self.delta_timestamps[key] = [i * dt for i in range(0, chunk_length + 1)]
|
| 161 |
-
self.delta_timestamps["observation.state"] = [i * dt for i in range(0, chunk_length + 1)]
|
| 162 |
-
self.delta_timestamps["action"] = [i * dt for i in range(0, chunk_length + 1)]
|
| 163 |
-
|
| 164 |
-
# Normalize repo_id and root to lists
|
| 165 |
-
repo_id_list: list[str] = [repo_id] if isinstance(repo_id, str) else list(repo_id)
|
| 166 |
-
root_list: list[str | None]
|
| 167 |
-
if root is None:
|
| 168 |
-
root_list = [None for _ in repo_id_list]
|
| 169 |
-
elif isinstance(root, str):
|
| 170 |
-
root_list = [root]
|
| 171 |
-
else:
|
| 172 |
-
root_list = [r for r in root]
|
| 173 |
-
|
| 174 |
-
if len(repo_id_list) != len(root_list):
|
| 175 |
-
raise ValueError(
|
| 176 |
-
f"Length mismatch: repo_id has {len(repo_id_list)} items, root has {len(root_list)} items."
|
| 177 |
-
)
|
| 178 |
-
|
| 179 |
-
# Load all datasets
|
| 180 |
-
self.datasets: list[LeRobotDataset] = []
|
| 181 |
-
self.tasks_dfs: list = [] # Store tasks DataFrames for each dataset
|
| 182 |
-
for rid, r in zip(repo_id_list, root_list):
|
| 183 |
-
dataset = LeRobotDataset(
|
| 184 |
-
repo_id=rid,
|
| 185 |
-
root=r,
|
| 186 |
-
delta_timestamps=self.delta_timestamps, # type: ignore
|
| 187 |
-
tolerance_s=tolerance_s,
|
| 188 |
-
force_cache_sync=force_cache_sync,
|
| 189 |
-
download_videos=download_videos,
|
| 190 |
-
video_backend=video_backend,
|
| 191 |
-
episodes=None, # Load full dataset, filter later
|
| 192 |
-
)
|
| 193 |
-
self.datasets.append(dataset)
|
| 194 |
-
self.tasks_dfs.append(dataset.meta.tasks)
|
| 195 |
-
|
| 196 |
-
# Build index mapping: list of (dataset_idx, local_idx) for valid frames
|
| 197 |
-
self.index_map: list[tuple[int, int, int]] = [] # (dataset_idx, local_idx, episode_idx)
|
| 198 |
-
self._episode_boundaries: list[dict[int, tuple[int, int]]] = []
|
| 199 |
-
self._episode_splits: list[tuple[set[int], set[int]]] = []
|
| 200 |
-
|
| 201 |
-
total_episodes = 0
|
| 202 |
-
total_frames = 0
|
| 203 |
-
for ds_idx, dataset in enumerate(self.datasets):
|
| 204 |
-
# Compute episode splits for this dataset
|
| 205 |
-
train_eps, val_eps = self._compute_episode_splits_for_dataset(dataset)
|
| 206 |
-
self._episode_splits.append((train_eps, val_eps))
|
| 207 |
-
|
| 208 |
-
# Get episodes for current split
|
| 209 |
-
split_episodes = self._get_split_episodes_for_dataset(ds_idx)
|
| 210 |
-
|
| 211 |
-
# Build episode boundaries
|
| 212 |
-
boundaries = self._build_episode_boundaries_for_dataset(dataset)
|
| 213 |
-
self._episode_boundaries.append(boundaries)
|
| 214 |
-
|
| 215 |
-
# Filter indices
|
| 216 |
-
indices = self._filter_indices_for_dataset(ds_idx, dataset, split_episodes, boundaries)
|
| 217 |
-
self.index_map.extend(indices)
|
| 218 |
-
|
| 219 |
-
total_episodes += dataset.num_episodes
|
| 220 |
-
total_frames += len(dataset)
|
| 221 |
-
|
| 222 |
-
log.info(
|
| 223 |
-
f"Loaded LIBERO dataset with {len(repo_id_list)} source(s) split={self.split!r} "
|
| 224 |
-
f"camera_mode={self.camera_mode!r} "
|
| 225 |
-
f"total_episodes={total_episodes} "
|
| 226 |
-
f"total_frames={total_frames} "
|
| 227 |
-
f"valid_indices={len(self.index_map)}"
|
| 228 |
-
)
|
| 229 |
-
|
| 230 |
-
def _compute_episode_splits_for_dataset(self, dataset: LeRobotDataset) -> tuple[set[int], set[int]]:
|
| 231 |
-
"""Compute train/val episode splits deterministically for a single dataset."""
|
| 232 |
-
total_episodes = int(dataset.meta.total_episodes)
|
| 233 |
-
|
| 234 |
-
if not (0.0 < self.val_ratio < 1.0):
|
| 235 |
-
raise ValueError(f"{self.val_ratio=} must be in (0, 1).")
|
| 236 |
-
|
| 237 |
-
n_val = max(1, int(round(total_episodes * self.val_ratio)))
|
| 238 |
-
# val_eps = set(range(n_val))
|
| 239 |
-
# train_eps = set(range(n_val, total_episodes))
|
| 240 |
-
|
| 241 |
-
# Yihuai: Randomly select validation episodes instead of the first n_val episodes (otherwise task will be repeated)
|
| 242 |
-
rng = random.Random(self.seed) # To ensure validation episodes are the same on all ranks
|
| 243 |
-
val_eps = set(rng.sample(range(total_episodes), n_val))
|
| 244 |
-
train_eps = set(range(total_episodes)) - val_eps
|
| 245 |
-
|
| 246 |
-
log.info(f"train_eps={train_eps}, val_eps={val_eps}")
|
| 247 |
-
|
| 248 |
-
return train_eps, val_eps
|
| 249 |
-
|
| 250 |
-
def _get_split_episodes_for_dataset(self, ds_idx: int) -> set[int]:
|
| 251 |
-
"""Get the episode set for the current split for a specific dataset."""
|
| 252 |
-
train_eps, val_eps = self._episode_splits[ds_idx]
|
| 253 |
-
if self.split in {"val", "valid", "validation", "eval", "test"}:
|
| 254 |
-
return val_eps
|
| 255 |
-
elif self.split == "train":
|
| 256 |
-
return train_eps
|
| 257 |
-
else: # full
|
| 258 |
-
return train_eps | val_eps
|
| 259 |
-
|
| 260 |
-
def _build_episode_boundaries_for_dataset(self, dataset: LeRobotDataset) -> dict[int, tuple[int, int]]:
|
| 261 |
-
"""Build a dict of episode_index -> (start_frame, end_frame) for a single dataset."""
|
| 262 |
-
boundaries: dict[int, tuple[int, int]] = {}
|
| 263 |
-
for ep in dataset.meta.episodes:
|
| 264 |
-
ep_idx = int(ep["episode_index"]) # type: ignore[index]
|
| 265 |
-
start = int(ep["dataset_from_index"]) # type: ignore[index]
|
| 266 |
-
end = int(ep["dataset_to_index"]) # type: ignore[index]
|
| 267 |
-
boundaries[ep_idx] = (start, end)
|
| 268 |
-
return boundaries
|
| 269 |
-
|
| 270 |
-
def _filter_indices_for_dataset(
|
| 271 |
-
self,
|
| 272 |
-
ds_idx: int,
|
| 273 |
-
dataset: LeRobotDataset,
|
| 274 |
-
split_episodes: set[int],
|
| 275 |
-
boundaries: dict[int, tuple[int, int]],
|
| 276 |
-
) -> list[tuple[int, int, int]]:
|
| 277 |
-
"""Filter valid indices for a single dataset, returning (dataset_idx, local_idx, episode_idx)."""
|
| 278 |
-
index_map: list[tuple[int, int, int]] = []
|
| 279 |
-
all_meta = list(dataset.meta.episodes)
|
| 280 |
-
|
| 281 |
-
for ep_idx in split_episodes:
|
| 282 |
-
if ep_idx >= len(all_meta):
|
| 283 |
-
continue
|
| 284 |
-
ep = all_meta[ep_idx]
|
| 285 |
-
|
| 286 |
-
ep_start = int(ep["dataset_from_index"]) # type: ignore[index]
|
| 287 |
-
ep_end = int(ep["dataset_to_index"]) # type: ignore[index]
|
| 288 |
-
|
| 289 |
-
# Valid range: [start, end - chunk_length - 1] inclusive
|
| 290 |
-
# We drop chunk_length frames at end to ensure we can query up to delta=chunk_length.
|
| 291 |
-
start = ep_start
|
| 292 |
-
end = ep_end - self.chunk_length - 1
|
| 293 |
-
|
| 294 |
-
if end >= start:
|
| 295 |
-
for local_idx in range(start, end + 1):
|
| 296 |
-
index_map.append((ds_idx, local_idx, ep_idx))
|
| 297 |
-
|
| 298 |
-
return index_map
|
| 299 |
-
|
| 300 |
-
def __len__(self) -> int:
|
| 301 |
-
return len(self.index_map)
|
| 302 |
-
|
| 303 |
-
def _get_task_description(self, ds_idx: int, item: dict) -> str:
|
| 304 |
-
"""Get task description for the current item from meta/tasks.parquet.
|
| 305 |
-
|
| 306 |
-
The tasks.parquet has task descriptions as the DataFrame index (row labels)
|
| 307 |
-
and task_index as an integer column. We look up by task_index and return
|
| 308 |
-
the corresponding index name (the actual task description string).
|
| 309 |
-
"""
|
| 310 |
-
task_idx = item.get("task_index")
|
| 311 |
-
if task_idx is not None:
|
| 312 |
-
if isinstance(task_idx, torch.Tensor):
|
| 313 |
-
task_idx = task_idx.item()
|
| 314 |
-
task_idx = int(task_idx)
|
| 315 |
-
tasks_df = self.tasks_dfs[ds_idx]
|
| 316 |
-
if task_idx in tasks_df["task_index"].values:
|
| 317 |
-
row = tasks_df[tasks_df["task_index"] == task_idx].iloc[0]
|
| 318 |
-
# The task description is the index name (row label), not a column value
|
| 319 |
-
return str(row.name)
|
| 320 |
-
raise ValueError(f"Task index {task_idx} not found in tasks.parquet for dataset {ds_idx}")
|
| 321 |
-
|
| 322 |
-
def _compute_anchored_actions(
|
| 323 |
-
self,
|
| 324 |
-
state_raw: torch.Tensor,
|
| 325 |
-
action_raw: torch.Tensor,
|
| 326 |
-
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
| 327 |
-
"""Compute anchored relative actions (batched).
|
| 328 |
-
|
| 329 |
-
Converts frame-wise relative actions to anchored relative actions where each
|
| 330 |
-
action[t] represents the target pose (after applying action[t] to state[t])
|
| 331 |
-
expressed in state 0's local coordinate frame.
|
| 332 |
-
|
| 333 |
-
Mathematical formulation:
|
| 334 |
-
1. Compute target in world frame (LIBERO convention):
|
| 335 |
-
- p_{t+1} = p_t + delta_p[t] (position addition in world frame)
|
| 336 |
-
- R_{t+1} = R_delta[t] @ R_t (rotation composition, delta first)
|
| 337 |
-
2. Compute anchored (left-multiply by T_0^{-1}):
|
| 338 |
-
- anchored_pos[t] = R_0^T @ (p_{t+1} - p_0)
|
| 339 |
-
- anchored_rot[t] = R_0^T @ R_{t+1}
|
| 340 |
-
|
| 341 |
-
Args:
|
| 342 |
-
state_raw: State tensor of shape (T+1, 8): [x, y, z, ax, ay, az, grip1, grip2]
|
| 343 |
-
where (ax, ay, az) is axis-angle rotation.
|
| 344 |
-
action_raw: Action tensor of shape (T+1, 7): [dx, dy, dz, dax, day, daz, grip]
|
| 345 |
-
where (dax, day, daz) is axis-angle rotation delta.
|
| 346 |
-
|
| 347 |
-
Returns:
|
| 348 |
-
anchored_translation: (T, 3) - position in state_0's local frame
|
| 349 |
-
anchored_rotation_9d: (T, 9) - rotation relative to state_0 as flattened 3x3 matrix
|
| 350 |
-
gripper: (T, 1) - original gripper commands (unchanged)
|
| 351 |
-
"""
|
| 352 |
-
# Extract positions and rotations from states
|
| 353 |
-
p_states = state_raw[:, :3] # [T+1,3]
|
| 354 |
-
rotvec_states = state_raw[:, 3:6] # [T+1,3] - axis-angle
|
| 355 |
-
|
| 356 |
-
# Extract deltas from actions (use first T actions)
|
| 357 |
-
delta_p = action_raw[:-1, :3] # [T,3]
|
| 358 |
-
delta_rotvec = action_raw[:-1, 3:6] # [T,3] - axis-angle delta
|
| 359 |
-
gripper = action_raw[:-1, 6:7] # [T,1]
|
| 360 |
-
|
| 361 |
-
# Convert all axis-angle to rotation matrices (batched)
|
| 362 |
-
R_states = convert_rotation(rotvec_states, input_format="axisangle", output_format="matrix") # [T+1,3,3]
|
| 363 |
-
R_deltas = convert_rotation(delta_rotvec, input_format="axisangle", output_format="matrix") # [T,3,3]
|
| 364 |
-
|
| 365 |
-
# Initial pose (state 0)
|
| 366 |
-
p_0 = p_states[0] # [3]
|
| 367 |
-
R_0 = R_states[0] # [3,3]
|
| 368 |
-
R_0_T = R_0.T # [3,3] - transpose for inverse rotation
|
| 369 |
-
|
| 370 |
-
# Current states for t = 0..T-1
|
| 371 |
-
p_t = p_states[:-1] # [T,3]
|
| 372 |
-
R_t = R_states[:-1] # [T,3,3]
|
| 373 |
-
|
| 374 |
-
# Step 1: Compute target poses in world frame (LIBERO convention)
|
| 375 |
-
# p_target = p_t + delta_p
|
| 376 |
-
p_target = p_t + delta_p # [T,3]
|
| 377 |
-
|
| 378 |
-
# R_target = R_delta @ R_t (batched matrix multiply)
|
| 379 |
-
R_target = torch.bmm(R_deltas, R_t) # [T,3,3]
|
| 380 |
-
|
| 381 |
-
# Step 2: Compute anchored (in state_0's local frame)
|
| 382 |
-
# anchored_p = R_0^T @ (p_target - p_0)
|
| 383 |
-
displacement = p_target - p_0 # [T,3]
|
| 384 |
-
anchored_p = (R_0_T @ displacement.T).T # [T,3]
|
| 385 |
-
|
| 386 |
-
# anchored_R = R_0^T @ R_target (batched)
|
| 387 |
-
R_0_T_expanded = R_0_T.unsqueeze(0).expand(R_target.shape[0], -1, -1) # [T,3,3]
|
| 388 |
-
anchored_R = torch.bmm(R_0_T_expanded, R_target) # [T,3,3]
|
| 389 |
-
|
| 390 |
-
return anchored_p, anchored_R, gripper
|
| 391 |
-
|
| 392 |
-
def _convert_rotation_to_repr(self, rotation_matrix: torch.Tensor) -> torch.Tensor:
|
| 393 |
-
"""Convert rotation matrix to the desired representation.
|
| 394 |
-
|
| 395 |
-
Args:
|
| 396 |
-
rotation_matrix: Rotation matrices of shape (T, 3, 3).
|
| 397 |
-
|
| 398 |
-
Returns:
|
| 399 |
-
Rotation in the configured ``rotation_space`` format.
|
| 400 |
-
"""
|
| 401 |
-
return convert_rotation(rotation_matrix, "matrix", libero_rotation_format(self.rotation_space))
|
| 402 |
-
|
| 403 |
-
def _normalizer_filename(self) -> str:
|
| 404 |
-
rotation_suffix = {
|
| 405 |
-
"3d": "3d",
|
| 406 |
-
"6d": "rot6d",
|
| 407 |
-
"9d": "rot9d",
|
| 408 |
-
}.get(self.rotation_space)
|
| 409 |
-
if rotation_suffix is None:
|
| 410 |
-
raise ValueError(f"Unsupported rotation_space={self.rotation_space!r}.")
|
| 411 |
-
action_space = self.action_space.replace("-", "_")
|
| 412 |
-
return f"{self._embodiment_type}_{action_space}_{rotation_suffix}.json"
|
| 413 |
-
|
| 414 |
-
def _resolve_action_stats_path(self, action_stats_path: str | None) -> Path:
|
| 415 |
-
if action_stats_path is None:
|
| 416 |
-
stats_path = self._NORMALIZERS_DIR / self._normalizer_filename()
|
| 417 |
-
if stats_path.exists():
|
| 418 |
-
return stats_path
|
| 419 |
-
raise FileNotFoundError(
|
| 420 |
-
f"Could not find bundled LIBERO action stats at {stats_path}. "
|
| 421 |
-
"Pass action_stats_path explicitly or regenerate stats with compute_action_stats.py."
|
| 422 |
-
)
|
| 423 |
-
|
| 424 |
-
stats_path = Path(action_stats_path)
|
| 425 |
-
if stats_path.is_absolute():
|
| 426 |
-
if stats_path.exists():
|
| 427 |
-
return stats_path
|
| 428 |
-
raise FileNotFoundError(f"Could not find action_stats_path={action_stats_path!r}.")
|
| 429 |
-
|
| 430 |
-
module_dir = Path(__file__).resolve().parent
|
| 431 |
-
candidates: list[Path] = []
|
| 432 |
-
for parent in module_dir.parents:
|
| 433 |
-
candidates.append(parent / stats_path)
|
| 434 |
-
candidates.append(self._NORMALIZERS_DIR / stats_path.name)
|
| 435 |
-
candidates.append(module_dir / stats_path.name)
|
| 436 |
-
for candidate in candidates:
|
| 437 |
-
if candidate.exists():
|
| 438 |
-
return candidate
|
| 439 |
-
raise FileNotFoundError(
|
| 440 |
-
f"Could not resolve action_stats_path={action_stats_path!r}; tried: {[str(c) for c in candidates]}"
|
| 441 |
-
)
|
| 442 |
-
|
| 443 |
-
def _set_range_denormalization_stats(self) -> None:
|
| 444 |
-
if self._norm_stats is None:
|
| 445 |
-
return
|
| 446 |
-
|
| 447 |
-
if self.action_normalization == "minmax":
|
| 448 |
-
lo_key, hi_key = "min", "max"
|
| 449 |
-
elif self.action_normalization in ("quantile", "quantile_rot"):
|
| 450 |
-
lo_key, hi_key = "q01", "q99"
|
| 451 |
-
else:
|
| 452 |
-
return
|
| 453 |
-
|
| 454 |
-
if lo_key not in self._norm_stats or hi_key not in self._norm_stats:
|
| 455 |
-
raise ValueError(
|
| 456 |
-
f"Action stats for {self.action_normalization!r} normalization require "
|
| 457 |
-
f"{lo_key!r} and {hi_key!r} entries."
|
| 458 |
-
)
|
| 459 |
-
self.action_min = self._norm_stats[lo_key] # [D]
|
| 460 |
-
self.action_max = self._norm_stats[hi_key] # [D]
|
| 461 |
-
action_range = self.action_max - self.action_min # [D]
|
| 462 |
-
self.action_range = torch.clamp(action_range, min=1e-6) # [D]
|
| 463 |
-
|
| 464 |
-
def __getitem__(self, idx: int, _retry_count: int = 0) -> dict[str, torch.Tensor | str]:
|
| 465 |
-
"""Get a single item from the dataset."""
|
| 466 |
-
max_retries = 10
|
| 467 |
-
ds_idx, local_idx, ep_idx = self.index_map[idx]
|
| 468 |
-
dataset = self.datasets[ds_idx]
|
| 469 |
-
try:
|
| 470 |
-
item = dataset[local_idx]
|
| 471 |
-
except Exception as e:
|
| 472 |
-
log.warning(
|
| 473 |
-
f"Error loading item (retry {_retry_count}/{max_retries}): idx={idx}, ds_idx={ds_idx}, "
|
| 474 |
-
f"local_idx={local_idx}, ep_idx={ep_idx}, repo_id={dataset.meta.repo_id}, error={e}"
|
| 475 |
-
)
|
| 476 |
-
if _retry_count >= max_retries:
|
| 477 |
-
raise RuntimeError(f"Failed to load data after {max_retries} retries") from e
|
| 478 |
-
new_idx = random.randint(0, len(self) - 1)
|
| 479 |
-
return self.__getitem__(new_idx, _retry_count + 1)
|
| 480 |
-
|
| 481 |
-
if self.mode == "joint":
|
| 482 |
-
mode = random.choice(["forward_dynamics", "inverse_dynamics", "policy", "image2video"])
|
| 483 |
-
else:
|
| 484 |
-
mode = self.mode
|
| 485 |
-
|
| 486 |
-
# Get task description for ai_caption
|
| 487 |
-
task_description = self._get_task_description(ds_idx, item)
|
| 488 |
-
|
| 489 |
-
# Process video based on camera mode (skipped entirely when
|
| 490 |
-
# skip_video_loading=True; image keys are also absent from
|
| 491 |
-
# delta_timestamps so LeRobot never decoded them).
|
| 492 |
-
video: torch.Tensor | None
|
| 493 |
-
if self._skip_video_loading:
|
| 494 |
-
video = None
|
| 495 |
-
else:
|
| 496 |
-
if self.camera_mode == "concat_view":
|
| 497 |
-
# Load both cameras and concatenate horizontally
|
| 498 |
-
video_1: torch.Tensor = item["observation.images.image"]
|
| 499 |
-
video_2: torch.Tensor = item["observation.images.wrist_image"]
|
| 500 |
-
|
| 501 |
-
# Resize each if needed
|
| 502 |
-
if video_1.shape[-1] != self.image_size or video_1.shape[-2] != self.image_size:
|
| 503 |
-
video_1 = F.resize(video_1, [self.image_size, self.image_size])
|
| 504 |
-
if video_2.shape[-1] != self.image_size or video_2.shape[-2] != self.image_size:
|
| 505 |
-
video_2 = F.resize(video_2, [self.image_size, self.image_size])
|
| 506 |
-
|
| 507 |
-
# Concatenate along width dimension (last dim for TCHW)
|
| 508 |
-
video_tchw = torch.cat([video_1, video_2], dim=-1) # (T, C, H, W*2)
|
| 509 |
-
else:
|
| 510 |
-
# Single camera mode
|
| 511 |
-
image_key = self.image_keys[0]
|
| 512 |
-
video_tchw = item[image_key]
|
| 513 |
-
|
| 514 |
-
# Resize if needed
|
| 515 |
-
if video_tchw.shape[-1] != self.image_size or video_tchw.shape[-2] != self.image_size:
|
| 516 |
-
video_tchw = F.resize(video_tchw, [self.image_size, self.image_size])
|
| 517 |
-
|
| 518 |
-
# Convert to uint8 and transpose to (C, T, H, W)
|
| 519 |
-
video = (video_tchw * 255).clamp(0, 255).to(torch.uint8).permute(1, 0, 2, 3)
|
| 520 |
-
|
| 521 |
-
# Action (raw): LIBERO actions are 7D (6 DoF + gripper)
|
| 522 |
-
action_raw: torch.Tensor = item["action"]
|
| 523 |
-
# State (raw): LIBERO state is 8D (6 DoF + 2 gripper states)
|
| 524 |
-
state_raw: torch.Tensor = item["observation.state"]
|
| 525 |
-
|
| 526 |
-
# Action: (T+1, D) -> (T, D)
|
| 527 |
-
# Take all but last action
|
| 528 |
-
# LIBERO action format: [x, y, z, ax, ay, az, gripper] (7D) where (ax,ay,az) is axis-angle
|
| 529 |
-
|
| 530 |
-
if self.action_space == "relative":
|
| 531 |
-
# Compute anchored relative actions
|
| 532 |
-
# Returns: translation (T, 3), rotation_matrix (T, 3, 3), gripper (T, 1)
|
| 533 |
-
translation, rotation_matrix, gripper = self._compute_anchored_actions(state_raw, action_raw.clone())
|
| 534 |
-
elif self.action_space == "frame_wise_relative":
|
| 535 |
-
action = action_raw[:-1].clone() # [T,7]
|
| 536 |
-
translation = action[:, :3] # [T,3]
|
| 537 |
-
rotation_rotvec = action[:, 3:6] # [T,3]
|
| 538 |
-
gripper = action[:, 6:] # [T,1]
|
| 539 |
-
rotation_matrix = convert_rotation(
|
| 540 |
-
rotation_rotvec, input_format="axisangle", output_format="matrix"
|
| 541 |
-
) # [T,3,3]
|
| 542 |
-
else:
|
| 543 |
-
raise ValueError(f"Unsupported action space: {self.action_space}")
|
| 544 |
-
|
| 545 |
-
rotation = self._convert_rotation_to_repr(rotation_matrix) # [T,rot_dim]
|
| 546 |
-
action = torch.cat([translation, rotation, gripper], dim=-1) # [T,action_dim]
|
| 547 |
-
|
| 548 |
-
# Compute idle_frames from the raw (un-normalized) action, only when the
|
| 549 |
-
# action layout has correct per-frame idle semantics (frame_wise_relative
|
| 550 |
-
# ⇔ backward_framewise). The other action_spaces ("relative",
|
| 551 |
-
# "absolute") encode per-frame motion differently and would not give
|
| 552 |
-
# meaningful idle counts under the same threshold check.
|
| 553 |
-
idle_frames: torch.Tensor | None = None
|
| 554 |
-
if self.action_space == "frame_wise_relative":
|
| 555 |
-
try:
|
| 556 |
-
spec = build_action_spec(Pos(), Rot(libero_rotation_format(self.rotation_space)), Gripper())
|
| 557 |
-
n = compute_idle_frames(action, spec)
|
| 558 |
-
idle_frames = torch.tensor(n, dtype=torch.long)
|
| 559 |
-
except (ValueError, TypeError):
|
| 560 |
-
idle_frames = None
|
| 561 |
-
|
| 562 |
-
if self.action_normalization is not None and self._norm_stats is not None and self.action_min is not None:
|
| 563 |
-
if action.shape[-1] != self.action_min.shape[0]:
|
| 564 |
-
raise ValueError(
|
| 565 |
-
f"Action dimension {action.shape[-1]} does not match stats dimension "
|
| 566 |
-
f"{self.action_min.shape[0]}. Recompute stats for the current "
|
| 567 |
-
f"rotation_space={self.rotation_space!r} and action_space={self.action_space!r}."
|
| 568 |
-
)
|
| 569 |
-
method = "quantile" if self.action_normalization == "quantile_rot" else self.action_normalization
|
| 570 |
-
action = normalize_action(action, method, self._norm_stats) # [T,D]
|
| 571 |
-
|
| 572 |
-
# Index
|
| 573 |
-
key = torch.tensor([local_idx], dtype=torch.long)
|
| 574 |
-
|
| 575 |
-
if self.camera_mode == "image":
|
| 576 |
-
viewpoint = "third_person_view"
|
| 577 |
-
elif self.camera_mode == "wrist_image":
|
| 578 |
-
viewpoint = "wrist_view"
|
| 579 |
-
else:
|
| 580 |
-
viewpoint = "concat_view"
|
| 581 |
-
|
| 582 |
-
result: dict[str, torch.Tensor | str] = {
|
| 583 |
-
"source_repo_id": dataset.meta.repo_id,
|
| 584 |
-
"video": video,
|
| 585 |
-
"action": action,
|
| 586 |
-
"action_raw": action_raw,
|
| 587 |
-
"conditioning_fps": torch.tensor(self.fps, dtype=torch.long),
|
| 588 |
-
"prompt": task_description,
|
| 589 |
-
"ai_caption": task_description,
|
| 590 |
-
"mode": mode,
|
| 591 |
-
"state": state_raw,
|
| 592 |
-
"action_space": self.action_space,
|
| 593 |
-
"rotation_space": self.rotation_space,
|
| 594 |
-
"pose_coordinate_frame": self.pose_coordinate_frame,
|
| 595 |
-
"__key__": key,
|
| 596 |
-
"domain_id": torch.tensor(self.domain_id, dtype=torch.long),
|
| 597 |
-
"viewpoint": viewpoint,
|
| 598 |
-
}
|
| 599 |
-
if idle_frames is not None:
|
| 600 |
-
result["idle_frames"] = idle_frames
|
| 601 |
-
|
| 602 |
-
if self.camera_mode == "concat_view" and not self._skip_video_loading:
|
| 603 |
-
result["additional_view_description"] = (
|
| 604 |
-
"The left half shows the third-person view; the right half shows the wrist-mounted camera."
|
| 605 |
-
)
|
| 606 |
-
|
| 607 |
-
return result
|
| 608 |
-
|
| 609 |
-
@property
|
| 610 |
-
def action_dim(self) -> int:
|
| 611 |
-
return libero_action_dim(self.rotation_space)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cosmos-framework/cosmos_framework/data/vfm/action/libero_pose_utils.py
DELETED
|
@@ -1,69 +0,0 @@
|
|
| 1 |
-
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
-
# SPDX-License-Identifier: OpenMDW-1.1
|
| 3 |
-
|
| 4 |
-
"""Small LIBERO pose helpers shared by training and closed-loop eval."""
|
| 5 |
-
|
| 6 |
-
from __future__ import annotations
|
| 7 |
-
|
| 8 |
-
import numpy as np
|
| 9 |
-
import torch
|
| 10 |
-
|
| 11 |
-
from cosmos_framework.data.vfm.action.pose_utils import (
|
| 12 |
-
RotationConvention,
|
| 13 |
-
build_abs_pose_from_components,
|
| 14 |
-
)
|
| 15 |
-
|
| 16 |
-
# Same local-frame post-rotation pattern used by DROID/Bridge/Fractal:
|
| 17 |
-
# R_opencv = R_native @ *_TO_OPENCV.
|
| 18 |
-
LIBERO_TO_OPENCV: np.ndarray = np.array(
|
| 19 |
-
[[0.0, -1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 0.0, 1.0]],
|
| 20 |
-
dtype=np.float32,
|
| 21 |
-
)
|
| 22 |
-
|
| 23 |
-
LIBERO_ROTATION_FORMATS: dict[str, RotationConvention] = {
|
| 24 |
-
"3d": "axisangle",
|
| 25 |
-
"6d": "rot6d",
|
| 26 |
-
"9d": "rot9d",
|
| 27 |
-
}
|
| 28 |
-
LIBERO_ACTION_DIMS: dict[str, int] = {"3d": 7, "6d": 10, "9d": 13}
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
def libero_rotation_format(rotation_space: str) -> RotationConvention:
|
| 32 |
-
"""Return the shared ``pose_utils`` rotation format for a LIBERO setting."""
|
| 33 |
-
rotation_format = LIBERO_ROTATION_FORMATS.get(rotation_space)
|
| 34 |
-
if rotation_format is None:
|
| 35 |
-
raise ValueError(f"Unsupported rotation_space={rotation_space!r}. Use 3d/6d/9d.")
|
| 36 |
-
return rotation_format
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
def libero_action_dim(rotation_space: str) -> int:
|
| 40 |
-
"""Return ``[xyz, rotation, gripper]`` action width for LIBERO."""
|
| 41 |
-
action_dim = LIBERO_ACTION_DIMS.get(rotation_space)
|
| 42 |
-
if action_dim is None:
|
| 43 |
-
raise ValueError(f"Unsupported rotation_space={rotation_space!r}. Use 3d/6d/9d.")
|
| 44 |
-
return action_dim
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
def libero_rotation_space_from_action_dim(action_dim: int) -> str:
|
| 48 |
-
"""Infer LIBERO rotation space from unpadded action width."""
|
| 49 |
-
for rotation_space, dim in LIBERO_ACTION_DIMS.items():
|
| 50 |
-
if dim == action_dim:
|
| 51 |
-
return rotation_space
|
| 52 |
-
raise ValueError(f"Unable to infer rotation_space from action_dim={action_dim}.")
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
def build_libero_abs_pose(state_raw: torch.Tensor | np.ndarray, *, to_opencv: bool) -> np.ndarray:
|
| 56 |
-
"""Build absolute LIBERO EE poses from state rows.
|
| 57 |
-
|
| 58 |
-
``state_raw`` is ``[x,y,z,axisangle(3),gripper(2)]``. When requested, the
|
| 59 |
-
local EE frame is post-rotated into the shared OpenCV-style action frame.
|
| 60 |
-
"""
|
| 61 |
-
if isinstance(state_raw, torch.Tensor):
|
| 62 |
-
state_np = state_raw.detach().cpu().numpy().astype(np.float32, copy=False)
|
| 63 |
-
else:
|
| 64 |
-
state_np = np.asarray(state_raw, dtype=np.float32)
|
| 65 |
-
|
| 66 |
-
poses_abs = build_abs_pose_from_components(state_np[:, :3], state_np[:, 3:6], "axisangle")
|
| 67 |
-
if to_opencv:
|
| 68 |
-
poses_abs[:, :3, :3] = poses_abs[:, :3, :3] @ LIBERO_TO_OPENCV
|
| 69 |
-
return poses_abs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cosmos-framework/cosmos_framework/data/vfm/action/normalizers/bridge_orig_lerobot_backward_framewise_rot6d.json
DELETED
|
@@ -1,33 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"metadata": {
|
| 3 |
-
"embodiment_type": "bridge_orig_lerobot",
|
| 4 |
-
"pose_convention": "backward_framewise",
|
| 5 |
-
"rotation_format": "rot6d",
|
| 6 |
-
"action_dim": 10,
|
| 7 |
-
"skip_rotation_dims": [3, 4, 5, 6, 7, 8],
|
| 8 |
-
"chunk_length": 16,
|
| 9 |
-
"sample_stride": 16,
|
| 10 |
-
"dataset_name": "bridge_20260416",
|
| 11 |
-
"dataset_class": "BridgeOrigLeRobotDataset",
|
| 12 |
-
"dataset_root": "",
|
| 13 |
-
"split": "train",
|
| 14 |
-
"num_samples_stats": 83036,
|
| 15 |
-
"reservoir_size": 5000000
|
| 16 |
-
},
|
| 17 |
-
"global": {
|
| 18 |
-
"mean": [-0.000094, -0.000394, 0.001623, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.582683],
|
| 19 |
-
"std": [ 0.013297, 0.009985, 0.012079, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.489959],
|
| 20 |
-
"min": [-0.309451, -0.074740, -0.082767, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
|
| 21 |
-
"max": [ 0.127018, 0.414660, 0.493186, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000],
|
| 22 |
-
"q01": [-0.038884, -0.028667, -0.037840, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
|
| 23 |
-
"q99": [ 0.039722, 0.029068, 0.026702, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000]
|
| 24 |
-
},
|
| 25 |
-
"global_raw": {
|
| 26 |
-
"mean": [-0.000094, -0.000394, 0.001623, 0.998307, -0.001371, 0.000061, 0.001414, 0.998226, -0.000154, 0.582683],
|
| 27 |
-
"std": [ 0.013297, 0.009985, 0.012079, 0.004630, 0.050168, 0.029018, 0.050165, 0.004328, 0.031742, 0.489959],
|
| 28 |
-
"min": [-0.309451, -0.074740, -0.082767, -0.845782, -0.636628, -0.401535, -0.590214, -0.217448, -0.979635, 0.000000],
|
| 29 |
-
"max": [ 0.127018, 0.414660, 0.493186, 1.000000, 0.362611, 0.601211, 0.619479, 1.000000, 0.365993, 1.000000],
|
| 30 |
-
"q01": [-0.038884, -0.028667, -0.037840, 0.976292, -0.163098, -0.081545, -0.160193, 0.976322, -0.078872, 0.000000],
|
| 31 |
-
"q99": [ 0.039722, 0.029068, 0.026702, 1.000000, 0.160195, 0.081655, 0.163227, 1.000000, 0.095189, 1.000000]
|
| 32 |
-
}
|
| 33 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cosmos-framework/cosmos_framework/data/vfm/action/normalizers/droid_lerobot_backward_framewise_rot6d.json
DELETED
|
@@ -1,33 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"metadata": {
|
| 3 |
-
"embodiment_type": "droid_lerobot",
|
| 4 |
-
"pose_convention": "backward_framewise",
|
| 5 |
-
"rotation_format": "rot6d",
|
| 6 |
-
"action_dim": 10,
|
| 7 |
-
"skip_rotation_dims": [3, 4, 5, 6, 7, 8],
|
| 8 |
-
"chunk_length": 16,
|
| 9 |
-
"sample_stride": 16,
|
| 10 |
-
"dataset_name": "droid_20260418",
|
| 11 |
-
"dataset_class": "DROIDLeRobotDataset",
|
| 12 |
-
"dataset_root": "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/droid_plus_lerobot_640x360_20260412",
|
| 13 |
-
"split": "train",
|
| 14 |
-
"num_samples_stats": 1321153,
|
| 15 |
-
"reservoir_size": 5000000
|
| 16 |
-
},
|
| 17 |
-
"global": {
|
| 18 |
-
"mean": [-0.000017, -0.000612, 0.000568, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.588911],
|
| 19 |
-
"std": [ 0.004539, 0.004054, 0.004999, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.441186],
|
| 20 |
-
"min": [-0.075397, -0.057288, -0.056677, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
|
| 21 |
-
"max": [ 0.073107, 0.082187, 0.077080, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000],
|
| 22 |
-
"q01": [-0.014200, -0.013416, -0.015206, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
|
| 23 |
-
"q99": [ 0.014515, 0.011517, 0.014520, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000]
|
| 24 |
-
},
|
| 25 |
-
"global_raw": {
|
| 26 |
-
"mean": [-0.000017, -0.000612, 0.000568, 0.999830, 0.000227, -0.000152, -0.000222, 0.999818, 0.000417, 0.588911],
|
| 27 |
-
"std": [ 0.004539, 0.004054, 0.004999, 0.000336, 0.014924, 0.010784, 0.014927, 0.000351, 0.011903, 0.441186],
|
| 28 |
-
"min": [-0.075397, -0.057288, -0.056677, 0.695640, -0.220599, -0.195892, -0.697421, 0.600468, -0.154176, 0.000000],
|
| 29 |
-
"max": [ 0.073107, 0.082187, 0.077080, 1.000000, 0.698449, 0.168089, 0.220605, 1.000000, 0.391206, 1.000000],
|
| 30 |
-
"q01": [-0.014200, -0.013416, -0.015206, 0.998459, -0.047659, -0.034774, -0.047609, 0.998428, -0.035553, 0.000000],
|
| 31 |
-
"q99": [ 0.014515, 0.011517, 0.014520, 1.000000, 0.047596, 0.034660, 0.047654, 1.000000, 0.038888, 1.000000]
|
| 32 |
-
}
|
| 33 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cosmos-framework/cosmos_framework/data/vfm/action/normalizers/fractal_backward_framewise_rot6d.json
DELETED
|
@@ -1,33 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"metadata": {
|
| 3 |
-
"embodiment_type": "fractal",
|
| 4 |
-
"pose_convention": "backward_framewise",
|
| 5 |
-
"rotation_format": "rot6d",
|
| 6 |
-
"action_dim": 10,
|
| 7 |
-
"skip_rotation_dims": [3, 4, 5, 6, 7, 8],
|
| 8 |
-
"chunk_length": 16,
|
| 9 |
-
"sample_stride": 16,
|
| 10 |
-
"dataset_name": "fractal_20260413",
|
| 11 |
-
"dataset_class": "FractalLeRobotDataset",
|
| 12 |
-
"dataset_root": "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/fractal20220817_data_no_noops",
|
| 13 |
-
"split": "train",
|
| 14 |
-
"num_samples_stats": 166961,
|
| 15 |
-
"reservoir_size": 5000000
|
| 16 |
-
},
|
| 17 |
-
"global": {
|
| 18 |
-
"mean": [ 0.002259, 0.000721, 0.009372, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.526947],
|
| 19 |
-
"std": [ 0.014178, 0.016428, 0.022554, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.499273],
|
| 20 |
-
"min": [-0.151886, -0.176424, -0.194576, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
|
| 21 |
-
"max": [ 0.130892, 0.190835, 0.193839, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000],
|
| 22 |
-
"q01": [-0.039816, -0.049270, -0.056266, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
|
| 23 |
-
"q99": [ 0.043860, 0.050352, 0.072505, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000]
|
| 24 |
-
},
|
| 25 |
-
"global_raw": {
|
| 26 |
-
"mean": [ 0.002259, 0.000721, 0.009372, 0.998347, 0.001789, 0.002694, -0.001861, 0.997811, 0.016366, 0.526947],
|
| 27 |
-
"std": [ 0.014178, 0.016428, 0.022554, 0.003377, 0.043416, 0.037369, 0.043211, 0.004566, 0.047057, 0.499273],
|
| 28 |
-
"min": [-0.151886, -0.176424, -0.194576, 0.520558, -0.676280, -0.822475, -0.460521, 0.736643, -0.517041, 0.000000],
|
| 29 |
-
"max": [ 0.130892, 0.190835, 0.193839, 1.000000, 0.461026, 0.403940, 0.671708, 1.000000, 0.505528, 1.000000],
|
| 30 |
-
"q01": [-0.039816, -0.049270, -0.056266, 0.983667, -0.134543, -0.107048, -0.126518, 0.977277, -0.091363, 0.000000],
|
| 31 |
-
"q99": [ 0.043860, 0.050352, 0.072505, 1.000000, 0.127404, 0.107273, 0.134140, 1.000000, 0.179731, 1.000000]
|
| 32 |
-
}
|
| 33 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cosmos-framework/cosmos_framework/data/vfm/action/normalizers/libero_native_frame_wise_relative_rot6d.json
DELETED
|
@@ -1,37 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"metadata": {
|
| 3 |
-
"embodiment_type": "libero",
|
| 4 |
-
"pose_convention": "frame_wise_relative",
|
| 5 |
-
"pose_coordinate_frame": "native",
|
| 6 |
-
"rotation_format": "6d",
|
| 7 |
-
"action_dim": 10,
|
| 8 |
-
"skip_rotation_dims": [3, 4, 5, 6, 7, 8],
|
| 9 |
-
"chunk_length": 16,
|
| 10 |
-
"sample_stride": null,
|
| 11 |
-
"dataset_name": "libero",
|
| 12 |
-
"dataset_class": "LIBERODataset",
|
| 13 |
-
"dataset_root": ["outputs/libero_datasets/libero_10", "outputs/libero_datasets/libero_object", "outputs/libero_datasets/libero_spatial", "outputs/libero_datasets/libero_goal"],
|
| 14 |
-
"_comment": "Dataset paths are placeholders; the statistics values are independent of local dataset location.",
|
| 15 |
-
"split": "train",
|
| 16 |
-
"num_samples_stats": 10000,
|
| 17 |
-
"reservoir_size": 50000,
|
| 18 |
-
"max_samples": 10000,
|
| 19 |
-
"sampling_seed": 42
|
| 20 |
-
},
|
| 21 |
-
"global": {
|
| 22 |
-
"mean": [ 0.050704, 0.097407, -0.094833, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.476725],
|
| 23 |
-
"std": [ 0.333621, 0.387175, 0.457140, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.499460],
|
| 24 |
-
"min": [-0.937500, -0.937500, -0.937500, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
|
| 25 |
-
"max": [ 0.937500, 0.937500, 0.937500, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000],
|
| 26 |
-
"q01": [-0.723214, -0.808929, -0.937500, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
|
| 27 |
-
"q99": [ 0.937500, 0.870536, 0.937500, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000]
|
| 28 |
-
},
|
| 29 |
-
"global_raw": {
|
| 30 |
-
"mean": [ 0.050704, 0.097407, -0.094833, 0.994873, -0.004579, -0.004288, 0.004389, 0.996104, 0.001109, 0.476725],
|
| 31 |
-
"std": [ 0.333621, 0.387175, 0.457140, 0.010807, 0.077802, 0.063386, 0.078571, 0.009994, 0.038504, 0.499460],
|
| 32 |
-
"min": [-0.937500, -0.937500, -0.937500, 0.902028, -0.356085, -0.367416, -0.370434, 0.921907, -0.255000, 0.000000],
|
| 33 |
-
"max": [ 0.937500, 0.937500, 0.937500, 1.000000, 0.368853, 0.341214, 0.356395, 1.000000, 0.348251, 1.000000],
|
| 34 |
-
"q01": [-0.723214, -0.808929, -0.937500, 0.934955, -0.223431, -0.189878, -0.334735, 0.938516, -0.107736, 0.000000],
|
| 35 |
-
"q99": [ 0.937500, 0.870536, 0.937500, 1.000000, 0.331000, 0.163153, 0.226216, 1.000000, 0.127158, 1.000000]
|
| 36 |
-
}
|
| 37 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cosmos-framework/cosmos_framework/data/vfm/action/normalizers/robomind-franka-dual_backward_framewise_rot6d.json
DELETED
|
@@ -1,33 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"metadata": {
|
| 3 |
-
"embodiment_type": "robomind-franka-dual",
|
| 4 |
-
"pose_convention": "backward_framewise",
|
| 5 |
-
"rotation_format": "rot6d",
|
| 6 |
-
"action_dim": 20,
|
| 7 |
-
"skip_rotation_dims": [3, 4, 5, 6, 7, 8, 13, 14, 15, 16, 17, 18],
|
| 8 |
-
"chunk_length": 16,
|
| 9 |
-
"sample_stride": 16,
|
| 10 |
-
"dataset_name": "robomind_franka_dual_20260414",
|
| 11 |
-
"dataset_class": "RoboMINDFrankaDataset",
|
| 12 |
-
"dataset_root": "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/RoboMIND_20251228",
|
| 13 |
-
"split": "train",
|
| 14 |
-
"num_samples_stats": 21410,
|
| 15 |
-
"reservoir_size": 5000000
|
| 16 |
-
},
|
| 17 |
-
"global": {
|
| 18 |
-
"mean": [ 0.000231, 0.000179, -0.000319, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.638652, 0.000148, -0.000377, -0.000241, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.815273],
|
| 19 |
-
"std": [ 0.014881, 0.008081, 0.014371, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.464058, 0.010628, 0.005868, 0.007900, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.366049],
|
| 20 |
-
"min": [-0.115093, -0.096415, -0.112595, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000, -0.091252, -0.052148, -0.113650, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
|
| 21 |
-
"max": [ 0.114941, 0.063433, 0.098721, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.123908, 0.077951, 0.080229, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.999628],
|
| 22 |
-
"q01": [-0.051367, -0.031964, -0.046482, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000, -0.035108, -0.021212, -0.029788, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
|
| 23 |
-
"q99": [ 0.043729, 0.021737, 0.036738, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.047581, 0.021270, 0.025712, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.995443]
|
| 24 |
-
},
|
| 25 |
-
"global_raw": {
|
| 26 |
-
"mean": [ 0.000231, 0.000179, -0.000319, 0.999350, 0.000318, -0.000135, -0.000286, 0.999586, -0.000051, 0.638652, 0.000148, -0.000377, -0.000241, 0.999294, 0.000343, 0.000431, -0.000147, 0.999580, -0.000570, 0.815273],
|
| 27 |
-
"std": [ 0.014881, 0.008081, 0.014371, 0.002235, 0.020196, 0.029781, 0.020185, 0.001125, 0.020484, 0.464058, 0.010628, 0.005868, 0.007900, 0.002664, 0.025404, 0.027550, 0.025193, 0.001657, 0.014210, 0.366049],
|
| 28 |
-
"min": [-0.115093, -0.096415, -0.112595, 0.944314, -0.271877, -0.325264, -0.254808, 0.962274, -0.227188, 0.000000, -0.091252, -0.052148, -0.113650, 0.941406, -0.265241, -0.273484, -0.290840, 0.954990, -0.264631, 0.000000],
|
| 29 |
-
"max": [ 0.114941, 0.063433, 0.098721, 1.000000, 0.258475, 0.270230, 0.271943, 1.000000, 0.221936, 1.000000, 0.123908, 0.077951, 0.080229, 1.000000, 0.296517, 0.333596, 0.269131, 1.000000, 0.139695, 0.999628],
|
| 30 |
-
"q01": [-0.051367, -0.031964, -0.046482, 0.988101, -0.053179, -0.128603, -0.075432, 0.994427, -0.059973, 0.000000, -0.035108, -0.021212, -0.029788, 0.986086, -0.098043, -0.111441, -0.093441, 0.991492, -0.058030, 0.000000],
|
| 31 |
-
"q99": [ 0.043729, 0.021737, 0.036738, 1.000000, 0.075612, 0.102791, 0.053223, 1.000000, 0.077057, 1.000000, 0.047581, 0.021270, 0.025712, 1.000000, 0.095525, 0.126049, 0.098778, 1.000000, 0.041914, 0.995443]
|
| 32 |
-
}
|
| 33 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cosmos-framework/cosmos_framework/data/vfm/action/normalizers/robomind-franka_backward_framewise_rot6d.json
DELETED
|
@@ -1,33 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"metadata": {
|
| 3 |
-
"embodiment_type": "robomind-franka",
|
| 4 |
-
"pose_convention": "backward_framewise",
|
| 5 |
-
"rotation_format": "rot6d",
|
| 6 |
-
"action_dim": 10,
|
| 7 |
-
"skip_rotation_dims": [3, 4, 5, 6, 7, 8],
|
| 8 |
-
"chunk_length": 16,
|
| 9 |
-
"sample_stride": 16,
|
| 10 |
-
"dataset_name": "robomind_franka_20260414",
|
| 11 |
-
"dataset_class": "RoboMINDFrankaDataset",
|
| 12 |
-
"dataset_root": "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/RoboMIND_20251228",
|
| 13 |
-
"split": "train",
|
| 14 |
-
"num_samples_stats": 141658,
|
| 15 |
-
"reservoir_size": 5000000
|
| 16 |
-
},
|
| 17 |
-
"global": {
|
| 18 |
-
"mean": [ 0.000241, 0.000073, -0.000597, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.630501],
|
| 19 |
-
"std": [ 0.020545, 0.010725, 0.022054, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.434021],
|
| 20 |
-
"min": [-0.184377, -0.130924, -0.183947, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
|
| 21 |
-
"max": [ 0.227682, 0.134118, 0.133222, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000],
|
| 22 |
-
"q01": [-0.065029, -0.030683, -0.075321, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000],
|
| 23 |
-
"q99": [ 0.068546, 0.036309, 0.051772, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000]
|
| 24 |
-
},
|
| 25 |
-
"global_raw": {
|
| 26 |
-
"mean": [ 0.000241, 0.000073, -0.000597, 0.998782, 0.000605, 0.000003, -0.000592, 0.998245, -0.000508, 0.630501],
|
| 27 |
-
"std": [ 0.020545, 0.010725, 0.022054, 0.004056, 0.043101, 0.023671, 0.043102, 0.004948, 0.040306, 0.434021],
|
| 28 |
-
"min": [-0.184377, -0.130924, -0.183947, 0.837403, -0.525301, -0.384252, -0.543663, 0.801190, -0.490979, 0.000000],
|
| 29 |
-
"max": [ 0.227682, 0.134118, 0.133222, 1.000000, 0.543800, 0.389145, 0.522029, 1.000000, 0.414190, 1.000000],
|
| 30 |
-
"q01": [-0.065029, -0.030683, -0.075321, 0.981664, -0.137429, -0.069593, -0.140220, 0.976885, -0.140399, 0.000000],
|
| 31 |
-
"q99": [ 0.068546, 0.036309, 0.051772, 1.000000, 0.140290, 0.079942, 0.137529, 1.000000, 0.113651, 1.000000]
|
| 32 |
-
}
|
| 33 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cosmos-framework/cosmos_framework/data/vfm/action/robomind_franka_dataset.py
CHANGED
|
@@ -25,7 +25,6 @@ import torch
|
|
| 25 |
import torch.nn.functional as F
|
| 26 |
|
| 27 |
from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
|
| 28 |
-
ActionNormalization,
|
| 29 |
ActionSpec,
|
| 30 |
BaseActionLeRobotDataset,
|
| 31 |
Gripper,
|
|
@@ -98,7 +97,6 @@ class RoboMINDFrankaDataset(BaseActionLeRobotDataset):
|
|
| 98 |
mode: str = "policy",
|
| 99 |
embodiment_type: str = "robomind-franka",
|
| 100 |
pose_convention: str = "backward_framewise",
|
| 101 |
-
action_normalization: ActionNormalization | None = None,
|
| 102 |
viewpoint: Viewpoint = "concat_view",
|
| 103 |
enable_fast_init: bool = False,
|
| 104 |
) -> None:
|
|
@@ -119,7 +117,6 @@ class RoboMINDFrankaDataset(BaseActionLeRobotDataset):
|
|
| 119 |
viewpoint=viewpoint,
|
| 120 |
pose_convention=pose_convention,
|
| 121 |
rotation_format="rot6d",
|
| 122 |
-
action_normalization=action_normalization,
|
| 123 |
tolerance_s=1e-4,
|
| 124 |
enable_fast_init=enable_fast_init,
|
| 125 |
)
|
|
|
|
| 25 |
import torch.nn.functional as F
|
| 26 |
|
| 27 |
from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import (
|
|
|
|
| 28 |
ActionSpec,
|
| 29 |
BaseActionLeRobotDataset,
|
| 30 |
Gripper,
|
|
|
|
| 97 |
mode: str = "policy",
|
| 98 |
embodiment_type: str = "robomind-franka",
|
| 99 |
pose_convention: str = "backward_framewise",
|
|
|
|
| 100 |
viewpoint: Viewpoint = "concat_view",
|
| 101 |
enable_fast_init: bool = False,
|
| 102 |
) -> None:
|
|
|
|
| 117 |
viewpoint=viewpoint,
|
| 118 |
pose_convention=pose_convention,
|
| 119 |
rotation_format="rot6d",
|
|
|
|
| 120 |
tolerance_s=1e-4,
|
| 121 |
enable_fast_init=enable_fast_init,
|
| 122 |
)
|
cosmos-framework/cosmos_framework/data/vfm/action/umi_lerobot_dataset.py
CHANGED
|
@@ -12,7 +12,7 @@ import torch
|
|
| 12 |
from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata
|
| 13 |
|
| 14 |
from cosmos_framework.data.vfm.action.action_spec import ActionSpec, Gripper, Pos, Rot, build_action_spec
|
| 15 |
-
from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import
|
| 16 |
from cosmos_framework.data.vfm.action.pose_utils import PoseConvention, build_abs_pose_from_components, pose_abs_to_rel
|
| 17 |
from cosmos_framework.data.vfm.action.viewpoint_utils import Viewpoint
|
| 18 |
|
|
@@ -35,7 +35,6 @@ class UMIFastLeRobotDataset(BaseActionLeRobotDataset):
|
|
| 35 |
split: Literal["train", "val", "full"] = "train",
|
| 36 |
mode: str = "policy",
|
| 37 |
pose_convention: PoseConvention = "backward_framewise",
|
| 38 |
-
action_normalization: ActionNormalization | None = None,
|
| 39 |
viewpoint: Viewpoint = "ego_view",
|
| 40 |
enable_fast_init: bool = False,
|
| 41 |
) -> None:
|
|
@@ -50,7 +49,6 @@ class UMIFastLeRobotDataset(BaseActionLeRobotDataset):
|
|
| 50 |
viewpoint=viewpoint,
|
| 51 |
pose_convention=pose_convention,
|
| 52 |
rotation_format="rot6d",
|
| 53 |
-
action_normalization=action_normalization,
|
| 54 |
tolerance_s=1e-4,
|
| 55 |
enable_fast_init=enable_fast_init,
|
| 56 |
)
|
|
|
|
| 12 |
from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata
|
| 13 |
|
| 14 |
from cosmos_framework.data.vfm.action.action_spec import ActionSpec, Gripper, Pos, Rot, build_action_spec
|
| 15 |
+
from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import BaseActionLeRobotDataset
|
| 16 |
from cosmos_framework.data.vfm.action.pose_utils import PoseConvention, build_abs_pose_from_components, pose_abs_to_rel
|
| 17 |
from cosmos_framework.data.vfm.action.viewpoint_utils import Viewpoint
|
| 18 |
|
|
|
|
| 35 |
split: Literal["train", "val", "full"] = "train",
|
| 36 |
mode: str = "policy",
|
| 37 |
pose_convention: PoseConvention = "backward_framewise",
|
|
|
|
| 38 |
viewpoint: Viewpoint = "ego_view",
|
| 39 |
enable_fast_init: bool = False,
|
| 40 |
) -> None:
|
|
|
|
| 49 |
viewpoint=viewpoint,
|
| 50 |
pose_convention=pose_convention,
|
| 51 |
rotation_format="rot6d",
|
|
|
|
| 52 |
tolerance_s=1e-4,
|
| 53 |
enable_fast_init=enable_fast_init,
|
| 54 |
)
|
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/README.md
CHANGED
|
@@ -4,10 +4,15 @@ Interactive 3D viewer for release-supported robot action datasets. It renders
|
|
| 4 |
RGB frames, end-effector trajectories, and robot meshes/IK with
|
| 5 |
[`viser`](https://viser.studio/), MuJoCo, and Pinocchio.
|
| 6 |
|
| 7 |
-
The
|
| 8 |
|
| 9 |
-
-
|
| 10 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
The dataset LazyConfig entries live in:
|
| 13 |
|
|
@@ -17,29 +22,10 @@ cosmos_framework/data/vfm/action/urdf_visualizer/action_datasets.py
|
|
| 17 |
|
| 18 |
## Dataset roots
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
```bash
|
| 25 |
-
# Bridge root: downloaded from nvidia/bridge_lerobot_v3 or an existing mirror.
|
| 26 |
-
export BRIDGE_LEROBOT_ROOT=/path/to/bridge_lerobot_v3
|
| 27 |
-
|
| 28 |
-
# LIBERO root: parent directory containing libero_10/, libero_object/,
|
| 29 |
-
# libero_spatial/, and libero_goal/.
|
| 30 |
-
export LIBERO_ROOT=/path/to/LIBERO_LeRobot_v3
|
| 31 |
-
```
|
| 32 |
-
|
| 33 |
-
To download the public datasets instead:
|
| 34 |
-
|
| 35 |
-
```bash
|
| 36 |
-
uvx hf@latest download --repo-type dataset nvidia/bridge_lerobot_v3 \
|
| 37 |
-
--local-dir examples/data/bridge_lerobot_v3 --quiet
|
| 38 |
-
|
| 39 |
-
uvx hf@latest download --repo-type dataset nvidia/LIBERO_LeRobot_v3 \
|
| 40 |
-
--revision ddc1edeb6e51e2b7d4d2ba7a1433daaecd37aa64 \
|
| 41 |
-
--local-dir examples/data/LIBERO_LeRobot_v3 --quiet
|
| 42 |
-
```
|
| 43 |
|
| 44 |
## Local launch
|
| 45 |
|
|
@@ -53,8 +39,6 @@ Then launch from the repository root:
|
|
| 53 |
|
| 54 |
```bash
|
| 55 |
export PYTHONPATH=.
|
| 56 |
-
export BRIDGE_LEROBOT_ROOT=/path/to/bridge_lerobot_v3
|
| 57 |
-
export LIBERO_ROOT=/path/to/LIBERO_LeRobot_v3
|
| 58 |
|
| 59 |
python cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py \
|
| 60 |
--port 8020 \
|
|
|
|
| 4 |
RGB frames, end-effector trajectories, and robot meshes/IK with
|
| 5 |
[`viser`](https://viser.studio/), MuJoCo, and Pinocchio.
|
| 6 |
|
| 7 |
+
The standalone viewer registry is intentionally scoped to the packaged demo datasets:
|
| 8 |
|
| 9 |
+
- AV
|
| 10 |
+
- Fractal
|
| 11 |
+
- Bridge
|
| 12 |
+
- DROID
|
| 13 |
+
- UMI / FastUMI
|
| 14 |
+
- RoboMIND Franka
|
| 15 |
+
- RoboMIND Franka dual
|
| 16 |
|
| 17 |
The dataset LazyConfig entries live in:
|
| 18 |
|
|
|
|
| 22 |
|
| 23 |
## Dataset roots
|
| 24 |
|
| 25 |
+
The Docker Space uses packaged examples under `/app/assets/examples`. For local
|
| 26 |
+
development, override individual roots only when needed, e.g. `BRIDGE_ROOT`,
|
| 27 |
+
`FRACTAL_ROOT`, `DROID_ROOT`, `UMI_ROOT`, `ROBOMIND_FRANKA_ROOT`,
|
| 28 |
+
`ROBOMIND_FRANKA_DUAL_ROOT`, or `AV_ROOT`.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
## Local launch
|
| 31 |
|
|
|
|
| 39 |
|
| 40 |
```bash
|
| 41 |
export PYTHONPATH=.
|
|
|
|
|
|
|
| 42 |
|
| 43 |
python cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py \
|
| 44 |
--port 8020 \
|
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/action_datasets.py
CHANGED
|
@@ -49,7 +49,6 @@ BRIDGE_ROOT = _env_path("BRIDGE_LEROBOT_ROOT", "DATASET_PATH", default="/app/ass
|
|
| 49 |
DATASET_BRIDGE_480 = L(dataset_entry)(
|
| 50 |
name="bridge_20260501",
|
| 51 |
dataset=L(BridgeOrigLeRobotDataset)(
|
| 52 |
-
action_normalization="quantile_rot",
|
| 53 |
chunk_length=16,
|
| 54 |
enable_fast_init=True,
|
| 55 |
fps=5.0,
|
|
@@ -72,7 +71,6 @@ DATASET_FRACTAL_256 = L(dataset_entry)(
|
|
| 72 |
root=_env_path("FRACTAL_ROOT", default="/app/assets/examples/fractal20220817_data"),
|
| 73 |
split="train",
|
| 74 |
mode="joint",
|
| 75 |
-
action_normalization="quantile_rot",
|
| 76 |
enable_fast_init=False,
|
| 77 |
),
|
| 78 |
ratio=1,
|
|
@@ -86,7 +84,6 @@ DATASET_DROID_480 = L(dataset_entry)(
|
|
| 86 |
split="train",
|
| 87 |
use_success_only=True,
|
| 88 |
mode="joint",
|
| 89 |
-
action_normalization="quantile_rot",
|
| 90 |
enable_fast_init=False,
|
| 91 |
),
|
| 92 |
ratio=1,
|
|
@@ -100,7 +97,6 @@ DATASET_ROBOMIND_FRANKA_480 = L(dataset_entry)(
|
|
| 100 |
split="train",
|
| 101 |
mode="joint",
|
| 102 |
embodiment_type="robomind-franka",
|
| 103 |
-
action_normalization="quantile_rot",
|
| 104 |
enable_fast_init=False,
|
| 105 |
),
|
| 106 |
ratio=1,
|
|
@@ -114,7 +110,6 @@ DATASET_ROBOMIND_FRANKA_DUAL_480 = L(dataset_entry)(
|
|
| 114 |
split="train",
|
| 115 |
mode="joint",
|
| 116 |
embodiment_type="robomind-franka-dual",
|
| 117 |
-
action_normalization="quantile_rot",
|
| 118 |
enable_fast_init=False,
|
| 119 |
),
|
| 120 |
ratio=1,
|
|
@@ -151,7 +146,6 @@ DATASET_UMI_256 = L(dataset_entry)(
|
|
| 151 |
root=_env_path("UMI_ROOT", default="/app/assets/examples/fastumi/fastumi_single_arm/pour_coke"),
|
| 152 |
split="train",
|
| 153 |
mode="joint",
|
| 154 |
-
action_normalization=None,
|
| 155 |
enable_fast_init=False,
|
| 156 |
),
|
| 157 |
ratio=1,
|
|
|
|
| 49 |
DATASET_BRIDGE_480 = L(dataset_entry)(
|
| 50 |
name="bridge_20260501",
|
| 51 |
dataset=L(BridgeOrigLeRobotDataset)(
|
|
|
|
| 52 |
chunk_length=16,
|
| 53 |
enable_fast_init=True,
|
| 54 |
fps=5.0,
|
|
|
|
| 71 |
root=_env_path("FRACTAL_ROOT", default="/app/assets/examples/fractal20220817_data"),
|
| 72 |
split="train",
|
| 73 |
mode="joint",
|
|
|
|
| 74 |
enable_fast_init=False,
|
| 75 |
),
|
| 76 |
ratio=1,
|
|
|
|
| 84 |
split="train",
|
| 85 |
use_success_only=True,
|
| 86 |
mode="joint",
|
|
|
|
| 87 |
enable_fast_init=False,
|
| 88 |
),
|
| 89 |
ratio=1,
|
|
|
|
| 97 |
split="train",
|
| 98 |
mode="joint",
|
| 99 |
embodiment_type="robomind-franka",
|
|
|
|
| 100 |
enable_fast_init=False,
|
| 101 |
),
|
| 102 |
ratio=1,
|
|
|
|
| 110 |
split="train",
|
| 111 |
mode="joint",
|
| 112 |
embodiment_type="robomind-franka-dual",
|
|
|
|
| 113 |
enable_fast_init=False,
|
| 114 |
),
|
| 115 |
ratio=1,
|
|
|
|
| 146 |
root=_env_path("UMI_ROOT", default="/app/assets/examples/fastumi/fastumi_single_arm/pour_coke"),
|
| 147 |
split="train",
|
| 148 |
mode="joint",
|
|
|
|
| 149 |
enable_fast_init=False,
|
| 150 |
),
|
| 151 |
ratio=1,
|
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/ik_solver.py
CHANGED
|
@@ -575,8 +575,8 @@ def compute_mujoco_geom_transforms(
|
|
| 575 |
for i, mi in enumerate(pin_to_mj_map):
|
| 576 |
data.qpos[mi] = q[i]
|
| 577 |
else:
|
| 578 |
-
# For robots with a separate gripper ctrl signal
|
| 579 |
-
#
|
| 580 |
# not a qpos DOF. For other robots the pinocchio output already
|
| 581 |
# includes finger joints in the trailing columns; write them all.
|
| 582 |
n_set = n_arm if has_gripper_ctrl else len(q)
|
|
|
|
| 575 |
for i, mi in enumerate(pin_to_mj_map):
|
| 576 |
data.qpos[mi] = q[i]
|
| 577 |
else:
|
| 578 |
+
# For robots with a separate gripper ctrl signal, only write arm
|
| 579 |
+
# joints — the extra column is a raw gripper value,
|
| 580 |
# not a qpos DOF. For other robots the pinocchio output already
|
| 581 |
# includes finger joints in the trailing columns; write them all.
|
| 582 |
n_set = n_arm if has_gripper_ctrl else len(q)
|
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/unified_action.py
CHANGED
|
@@ -1,33 +1,23 @@
|
|
| 1 |
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
# SPDX-License-Identifier: OpenMDW-1.1
|
| 3 |
|
| 4 |
-
"""
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
Each 9D SE(3) slot is ``[pos(3) + rot6d(6)]``.
|
| 11 |
-
Each finger slot is 3D (position in wrist-local frame), 5 fingers × 3D = 15D.
|
| 12 |
-
|
| 13 |
-
Any supported action format is converted to ``UnifiedAction(action_57d, mask)``
|
| 14 |
-
before the viewer processes it. The mask explicitly declares which slots are valid.
|
| 15 |
"""
|
| 16 |
|
| 17 |
from __future__ import annotations
|
| 18 |
|
| 19 |
-
from dataclasses import dataclass
|
| 20 |
from enum import Enum
|
| 21 |
from typing import Any
|
| 22 |
|
| 23 |
import numpy as np
|
| 24 |
import torch
|
| 25 |
|
| 26 |
-
from cosmos_framework.utils import log
|
| 27 |
-
from cosmos_framework.data.vfm.action.pose_utils import convert_rotation
|
| 28 |
-
|
| 29 |
-
FINGER_NAMES = ("thumb", "index", "middle", "ring", "pinky")
|
| 30 |
-
ALL_FINGERS = (True, True, True, True, True)
|
| 31 |
NO_FINGERS = (False, False, False, False, False)
|
| 32 |
|
| 33 |
|
|
@@ -37,30 +27,15 @@ class ActionFormat(str, Enum):
|
|
| 37 |
EGO_9D = "9d"
|
| 38 |
SINGLE_ARM_10D = "10d"
|
| 39 |
DUAL_ARM_20D = "20d"
|
| 40 |
-
UNIFIED_57D = "57d"
|
| 41 |
|
| 42 |
@property
|
| 43 |
def expected_dim(self) -> int:
|
| 44 |
-
|
| 45 |
-
return {
|
| 46 |
-
ActionFormat.EGO_9D: 9,
|
| 47 |
-
ActionFormat.SINGLE_ARM_10D: 10,
|
| 48 |
-
ActionFormat.DUAL_ARM_20D: 20,
|
| 49 |
-
ActionFormat.UNIFIED_57D: 57,
|
| 50 |
-
}[self]
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
# ─── Data Structures ─────────────────────────────────────────────────────────
|
| 54 |
|
| 55 |
|
| 56 |
@dataclass
|
| 57 |
class Action57DMask:
|
| 58 |
-
"""Per-component validity for the 57D
|
| 59 |
-
|
| 60 |
-
``right_fingers`` / ``left_fingers`` are tuples of 5 bools
|
| 61 |
-
(thumb, index, middle, ring, pinky) — supports any combination from
|
| 62 |
-
2-finger grippers to full 5-finger hands.
|
| 63 |
-
"""
|
| 64 |
|
| 65 |
ego: bool = False
|
| 66 |
right_wrist: bool = False
|
|
@@ -71,71 +46,41 @@ class Action57DMask:
|
|
| 71 |
|
| 72 |
@dataclass
|
| 73 |
class UnifiedAction:
|
| 74 |
-
"""
|
| 75 |
-
|
| 76 |
-
``action`` is always shape ``(T, 57)`` with invalid slots zero-padded.
|
| 77 |
-
``gripper_right`` / ``gripper_left`` carry auxiliary scalar gripper data
|
| 78 |
-
for embodiments that don't map to finger positions (V-shape visualisation).
|
| 79 |
-
"""
|
| 80 |
|
| 81 |
-
action: np.ndarray
|
| 82 |
mask: Action57DMask
|
| 83 |
-
gripper_right: np.ndarray | None = None
|
| 84 |
-
gripper_left: np.ndarray | None = None
|
| 85 |
|
| 86 |
|
| 87 |
@dataclass
|
| 88 |
class SceneState:
|
| 89 |
-
"""
|
| 90 |
|
| 91 |
-
|
| 92 |
-
- all SE(3) trajectories live in one shared ``scene_world`` frame
|
| 93 |
-
- fingertip positions are world-space if present
|
| 94 |
-
- gripper signals are scalar open/close values sampled at ``T+1`` frames
|
| 95 |
-
"""
|
| 96 |
-
|
| 97 |
-
mask: Action57DMask = field(default_factory=Action57DMask)
|
| 98 |
-
# Absolute SE(3) trajectories — (T+1, 4, 4)
|
| 99 |
ego_poses: np.ndarray | None = None
|
| 100 |
right_poses: np.ndarray | None = None
|
| 101 |
left_poses: np.ndarray | None = None
|
| 102 |
-
# World-space fingertip positions — (T+1, 5, 3)
|
| 103 |
right_fingers: np.ndarray | None = None
|
| 104 |
left_fingers: np.ndarray | None = None
|
| 105 |
-
# Scalar gripper — (T+1,)
|
| 106 |
gripper_right: np.ndarray | None = None
|
| 107 |
gripper_left: np.ndarray | None = None
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
action_raw: np.ndarray | None = None # canonical 57D action tensor for display
|
| 111 |
T: int = 0
|
| 112 |
-
# FK mesh animation: raw (T, nq) joint configs populated by datasets that
|
| 113 |
-
# perform EE conversion internally (e.g. robomind-ur). When set, the renderer
|
| 114 |
-
# uses these for FK mesh animation instead of running IK on right_poses.
|
| 115 |
-
joint_configs: np.ndarray | None = None
|
| 116 |
-
|
| 117 |
|
| 118 |
-
# ─── Converters ───────────────────────────────────────────────────────────────
|
| 119 |
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
mask=Action57DMask(
|
| 126 |
-
ego=True,
|
| 127 |
-
right_wrist=True,
|
| 128 |
-
right_fingers=ALL_FINGERS,
|
| 129 |
-
left_wrist=True,
|
| 130 |
-
left_fingers=ALL_FINGERS,
|
| 131 |
-
),
|
| 132 |
-
)
|
| 133 |
|
| 134 |
|
| 135 |
def to_unified_from_10d(action: np.ndarray) -> UnifiedAction:
|
| 136 |
"""10D single arm ``[pos(3)+rot6d(6)+grip(1)]`` → right wrist + gripper."""
|
| 137 |
-
|
| 138 |
-
a = np.zeros((T, 57), dtype=np.float32) # [T,57]
|
| 139 |
a[:, 9:18] = action[:, :9]
|
| 140 |
return UnifiedAction(
|
| 141 |
action=a,
|
|
@@ -145,75 +90,50 @@ def to_unified_from_10d(action: np.ndarray) -> UnifiedAction:
|
|
| 145 |
|
| 146 |
|
| 147 |
def to_unified_from_20d(action: np.ndarray) -> UnifiedAction:
|
| 148 |
-
"""20D dual arm ``[left(10) | right(10)]`` → both wrists + both grippers.
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
"""
|
| 153 |
-
T = action.shape[0]
|
| 154 |
-
a = np.zeros((T, 57), dtype=np.float32) # [T,57]
|
| 155 |
-
a[:, 33:42] = action[:, :9] # left arm → left wrist slot [33:42]
|
| 156 |
-
a[:, 9:18] = action[:, 10:19] # right arm → right wrist slot [9:18]
|
| 157 |
return UnifiedAction(
|
| 158 |
action=a,
|
| 159 |
mask=Action57DMask(right_wrist=True, left_wrist=True),
|
| 160 |
-
gripper_right=action[:, 19].astype(np.float32),
|
| 161 |
-
gripper_left=action[:, 9].astype(np.float32),
|
| 162 |
-
)
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
def to_unified_from_9d(action: np.ndarray) -> UnifiedAction:
|
| 166 |
-
"""9D camera/AV ``[pos(3)+rot6d(6)]`` → ego only."""
|
| 167 |
-
T = action.shape[0]
|
| 168 |
-
a = np.zeros((T, 57), dtype=np.float32) # [T,57]
|
| 169 |
-
a[:, 0:9] = action[:, :9]
|
| 170 |
-
return UnifiedAction(
|
| 171 |
-
action=a,
|
| 172 |
-
mask=Action57DMask(ego=True),
|
| 173 |
)
|
| 174 |
|
| 175 |
|
| 176 |
def _validate_action_shape(action: np.ndarray, action_format: ActionFormat) -> None:
|
| 177 |
-
"""Raise when a raw action tensor does not match its declared format."""
|
| 178 |
if action.ndim != 2:
|
| 179 |
raise ValueError(f"Expected a rank-2 action array, got shape {action.shape}")
|
| 180 |
-
actual_dim = int(action.shape[-1])
|
| 181 |
expected_dim = action_format.expected_dim
|
|
|
|
| 182 |
if actual_dim != expected_dim:
|
| 183 |
raise ValueError(f"Action format {action_format.value} expects trailing dim {expected_dim}, got {actual_dim}")
|
| 184 |
|
| 185 |
|
| 186 |
def to_unified(action: np.ndarray, action_format: ActionFormat) -> UnifiedAction:
|
| 187 |
-
"""Convert one
|
| 188 |
_validate_action_shape(action, action_format)
|
| 189 |
-
if action_format is ActionFormat.UNIFIED_57D:
|
| 190 |
-
return to_unified_from_57d(action)
|
| 191 |
-
if action_format is ActionFormat.DUAL_ARM_20D:
|
| 192 |
-
return to_unified_from_20d(action)
|
| 193 |
if action_format is ActionFormat.EGO_9D:
|
| 194 |
return to_unified_from_9d(action)
|
| 195 |
if action_format is ActionFormat.SINGLE_ARM_10D:
|
| 196 |
return to_unified_from_10d(action)
|
|
|
|
|
|
|
| 197 |
raise ValueError(f"Unsupported action format: {action_format}")
|
| 198 |
|
| 199 |
|
| 200 |
def _pos_rot6d_to_mat(se3: np.ndarray) -> np.ndarray:
|
| 201 |
"""Convert ``(N, 9)`` pos+rot6d to ``(N, 4, 4)`` SE(3) matrices."""
|
| 202 |
-
|
| 203 |
pos = se3[:, :3]
|
| 204 |
r6 = se3[:, 3:9]
|
| 205 |
-
|
| 206 |
col0 = r6[:, :3].copy()
|
| 207 |
-
|
| 208 |
-
col0 = col0 / col0_norm
|
| 209 |
-
|
| 210 |
col1 = r6[:, 3:6] - np.sum(r6[:, 3:6] * col0, axis=-1, keepdims=True) * col0
|
| 211 |
-
|
| 212 |
-
col1 = col1 / col1_norm
|
| 213 |
-
|
| 214 |
col2 = np.cross(col0, col1)
|
| 215 |
-
|
| 216 |
-
mats = np.tile(np.eye(4, dtype=np.float32), (N, 1, 1))
|
| 217 |
mats[:, :3, 0] = col0
|
| 218 |
mats[:, :3, 1] = col1
|
| 219 |
mats[:, :3, 2] = col2
|
|
@@ -226,255 +146,77 @@ def _chain_se3(
|
|
| 226 |
initial_pose: np.ndarray | None = None,
|
| 227 |
pose_convention: str = "backward_framewise",
|
| 228 |
) -> np.ndarray:
|
| 229 |
-
"""Chain ``(T, 9)`` relative deltas into ``(T+1, 4, 4)`` absolute poses.
|
| 230 |
-
|
| 231 |
-
For ``backward_framewise``: ``P_{t+1} = P_t @ delta_t``.
|
| 232 |
-
For ``absolute``: each row is already an absolute pose (no chaining).
|
| 233 |
-
"""
|
| 234 |
-
T = deltas.shape[0]
|
| 235 |
delta_mats = _pos_rot6d_to_mat(deltas)
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
else:
|
| 240 |
-
initial_pose = initial_pose.astype(np.float32)
|
| 241 |
-
|
| 242 |
-
poses = np.empty((T + 1, 4, 4), dtype=np.float32)
|
| 243 |
-
poses[0] = initial_pose
|
| 244 |
-
|
| 245 |
if pose_convention == "absolute":
|
| 246 |
poses[1:] = delta_mats
|
| 247 |
else:
|
| 248 |
-
for t in range(
|
| 249 |
poses[t + 1] = poses[t] @ delta_mats[t]
|
| 250 |
-
|
| 251 |
return poses
|
| 252 |
|
| 253 |
|
| 254 |
-
def
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
return np.concatenate([fingers[:1], fingers], axis=0)
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
def _to_numpy_float32(value: object) -> np.ndarray:
|
| 262 |
-
"""Convert a tensor-like value to a float32 NumPy array."""
|
| 263 |
-
|
| 264 |
-
if isinstance(value, torch.Tensor):
|
| 265 |
-
return value.detach().cpu().numpy().astype(np.float32)
|
| 266 |
-
return np.asarray(value, dtype=np.float32)
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
def _quat_xyzw_to_rotmat(q: np.ndarray) -> np.ndarray:
|
| 270 |
-
"""Convert ``(N, 4)`` xyzw quaternions to ``(N, 3, 3)`` rotation matrices."""
|
| 271 |
-
x, y, z, w = q[:, 0], q[:, 1], q[:, 2], q[:, 3]
|
| 272 |
-
R = np.zeros((len(q), 3, 3), dtype=np.float32)
|
| 273 |
-
R[:, 0, 0] = 1 - 2 * (y * y + z * z)
|
| 274 |
-
R[:, 0, 1] = 2 * (x * y - z * w)
|
| 275 |
-
R[:, 0, 2] = 2 * (x * z + y * w)
|
| 276 |
-
R[:, 1, 0] = 2 * (x * y + z * w)
|
| 277 |
-
R[:, 1, 1] = 1 - 2 * (x * x + z * z)
|
| 278 |
-
R[:, 1, 2] = 2 * (y * z - x * w)
|
| 279 |
-
R[:, 2, 0] = 2 * (x * z - y * w)
|
| 280 |
-
R[:, 2, 1] = 2 * (y * z + x * w)
|
| 281 |
-
R[:, 2, 2] = 1 - 2 * (x * x + y * y)
|
| 282 |
-
return R
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
def _build_absolute_from_overlay(sample: dict) -> dict[str, np.ndarray] | None:
|
| 286 |
-
"""Build absolute world-frame poses from HandPoseDataset overlay data.
|
| 287 |
-
|
| 288 |
-
Returns None if overlay keys are missing.
|
| 289 |
-
"""
|
| 290 |
-
raw_cam_pos = sample.get("raw_cam_position")
|
| 291 |
-
if raw_cam_pos is None:
|
| 292 |
-
return None
|
| 293 |
-
|
| 294 |
-
cam_pos = _to_numpy_float32(raw_cam_pos) # [T+1,3]
|
| 295 |
-
cam_rot_q = _to_numpy_float32(sample["raw_cam_rotation"]) # [T+1,4]
|
| 296 |
-
right_3d = _to_numpy_float32(sample["raw_cam_right_3d"]) # [T+1,63]
|
| 297 |
-
left_3d = _to_numpy_float32(sample["raw_cam_left_3d"]) # [T+1,63]
|
| 298 |
-
right_rot = _to_numpy_float32(sample["raw_cam_right_rot"]) # [T+1,84]
|
| 299 |
-
left_rot = _to_numpy_float32(sample["raw_cam_left_rot"]) # [T+1,84]
|
| 300 |
-
|
| 301 |
-
T1 = cam_pos.shape[0]
|
| 302 |
-
FTIP = [4, 8, 12, 16, 20]
|
| 303 |
-
|
| 304 |
-
# Camera c2w (world frame)
|
| 305 |
-
cam_c2w = np.tile(np.eye(4, dtype=np.float32), (T1, 1, 1)) # [T+1,4,4]
|
| 306 |
-
cam_c2w[:, :3, 3] = cam_pos
|
| 307 |
-
cam_c2w[:, :3, :3] = _quat_xyzw_to_rotmat(cam_rot_q)
|
| 308 |
-
|
| 309 |
-
def _wrist_world(pos_63, rot_84):
|
| 310 |
-
wrist_pos = pos_63[:, :3]
|
| 311 |
-
wrist_q = rot_84.reshape(T1, 21, 4)[:, 0]
|
| 312 |
-
wrist_cam = np.tile(np.eye(4, dtype=np.float32), (T1, 1, 1)) # [T+1,4,4]
|
| 313 |
-
wrist_cam[:, :3, 3] = wrist_pos
|
| 314 |
-
wrist_cam[:, :3, :3] = _quat_xyzw_to_rotmat(wrist_q)
|
| 315 |
-
return cam_c2w @ wrist_cam
|
| 316 |
-
|
| 317 |
-
def _fingers_world(pos_63):
|
| 318 |
-
joints = pos_63.reshape(T1, 21, 3)[:, FTIP]
|
| 319 |
-
R = cam_c2w[:, :3, :3]
|
| 320 |
-
t = cam_c2w[:, :3, 3]
|
| 321 |
-
return np.einsum("tij,tfj->tfi", R, joints) + t[:, None, :] # [T+1,5,3]
|
| 322 |
-
|
| 323 |
-
return {
|
| 324 |
-
"ego_poses": cam_c2w,
|
| 325 |
-
"right_wrist_poses": _wrist_world(right_3d, right_rot),
|
| 326 |
-
"left_wrist_poses": _wrist_world(left_3d, left_rot),
|
| 327 |
-
"right_fingers": _fingers_world(right_3d),
|
| 328 |
-
"left_fingers": _fingers_world(left_3d),
|
| 329 |
-
}
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
def _build_libero_absolute_from_state(sample: dict) -> dict[str, np.ndarray] | None:
|
| 333 |
-
"""Build absolute LIBERO right-wrist poses from ``observation.state``.
|
| 334 |
-
|
| 335 |
-
LIBERO policy actions are normalized robosuite controller commands rather
|
| 336 |
-
than meter-scale SE(3) deltas. For visualization, the raw state sequence is
|
| 337 |
-
the correct source of metric end-effector poses.
|
| 338 |
-
"""
|
| 339 |
-
if sample.get("source_repo_id") is None or sample.get("state") is None:
|
| 340 |
-
return None
|
| 341 |
-
if "libero" not in str(sample.get("source_repo_id", "")).lower():
|
| 342 |
-
return None
|
| 343 |
-
|
| 344 |
-
from cosmos_framework.data.vfm.action.libero_pose_utils import build_libero_abs_pose
|
| 345 |
-
|
| 346 |
-
to_opencv = str(sample.get("pose_coordinate_frame", "native")) == "opencv"
|
| 347 |
-
right_poses = build_libero_abs_pose(sample["state"], to_opencv=to_opencv)
|
| 348 |
-
return {"right_wrist_poses": right_poses.astype(np.float32, copy=False)}
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
# ─── Scene State Builder ─────────────────────────────────────────────────────
|
| 352 |
|
| 353 |
|
| 354 |
def build_scene_state(
|
| 355 |
unified: UnifiedAction,
|
|
|
|
| 356 |
initial_pose: np.ndarray | None = None,
|
| 357 |
initial_pose_right: np.ndarray | None = None,
|
| 358 |
initial_pose_left: np.ndarray | None = None,
|
| 359 |
right_base_pose: np.ndarray | None = None,
|
| 360 |
left_base_pose: np.ndarray | None = None,
|
| 361 |
pose_convention: str = "backward_framewise",
|
| 362 |
-
sample: dict | None = None,
|
| 363 |
) -> SceneState:
|
| 364 |
-
"""Reconstruct
|
| 365 |
-
|
| 366 |
-
Chains SE(3) deltas for valid mask slots. If ``sample`` contains overlay
|
| 367 |
-
data (HandPoseDataset raw camera/joint fields), overrides with absolute
|
| 368 |
-
world-frame poses.
|
| 369 |
-
|
| 370 |
-
Args:
|
| 371 |
-
unified: Canonical 57D action with mask.
|
| 372 |
-
initial_pose: Default initial pose for all slots.
|
| 373 |
-
initial_pose_right: Override for right wrist (dual arm).
|
| 374 |
-
initial_pose_left: Override for left wrist (dual arm).
|
| 375 |
-
right_base_pose: Right-arm base pose that maps arm-local trajectories into ``scene_world``.
|
| 376 |
-
left_base_pose: Left-arm base pose that maps arm-local trajectories into ``scene_world``.
|
| 377 |
-
pose_convention: Pose convention for SE(3) chaining.
|
| 378 |
-
sample: Raw dataset sample (for overlay data).
|
| 379 |
-
"""
|
| 380 |
-
|
| 381 |
-
def _apply_pose_base(poses: np.ndarray | None, base_pose: np.ndarray | None) -> np.ndarray | None:
|
| 382 |
-
if poses is None or base_pose is None:
|
| 383 |
-
return poses
|
| 384 |
-
return np.einsum("ij,njk->nik", base_pose, poses).astype(np.float32) # [T+1,4,4]
|
| 385 |
-
|
| 386 |
-
def _fingers_local_to_world(
|
| 387 |
-
fingers_local: np.ndarray | None,
|
| 388 |
-
wrist_poses_world: np.ndarray | None,
|
| 389 |
-
) -> np.ndarray | None:
|
| 390 |
-
if fingers_local is None:
|
| 391 |
-
return None
|
| 392 |
-
if wrist_poses_world is None:
|
| 393 |
-
raise ValueError("Finger trajectories require matching wrist poses to build world-space SceneState")
|
| 394 |
-
wrist_rot = wrist_poses_world[:, :3, :3].astype(np.float32) # [T+1,3,3]
|
| 395 |
-
wrist_pos = wrist_poses_world[:, :3, 3].astype(np.float32) # [T+1,3]
|
| 396 |
-
return np.einsum("tij,tfj->tfi", wrist_rot, fingers_local) + wrist_pos[:, None, :] # [T+1,5,3]
|
| 397 |
-
|
| 398 |
-
mask = unified.mask
|
| 399 |
action = unified.action
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
ip_default = initial_pose if initial_pose is not None else np.eye(4, dtype=np.float32)
|
| 403 |
-
ip_right = initial_pose_right if initial_pose_right is not None else ip_default
|
| 404 |
-
ip_left = initial_pose_left if initial_pose_left is not None else ip_default
|
| 405 |
-
|
| 406 |
if mask.ego:
|
| 407 |
-
state.ego_poses = _chain_se3(action[:, 0:9],
|
| 408 |
if mask.right_wrist:
|
| 409 |
-
state.right_poses = _chain_se3(
|
| 410 |
-
|
| 411 |
-
|
|
|
|
|
|
|
| 412 |
if mask.left_wrist:
|
| 413 |
-
state.left_poses = _chain_se3(
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
if unified.gripper_right is not None:
|
| 418 |
-
g = unified.gripper_right
|
| 419 |
-
state.gripper_right = np.concatenate([[g[0]], g]).astype(np.float32, copy=False) # [T+1]
|
| 420 |
-
if unified.gripper_left is not None:
|
| 421 |
-
g = unified.gripper_left
|
| 422 |
-
state.gripper_left = np.concatenate([[g[0]], g]).astype(np.float32, copy=False) # [T+1]
|
| 423 |
-
|
| 424 |
-
abs_data = _build_absolute_from_overlay(sample) if sample is not None else None
|
| 425 |
-
if abs_data is not None:
|
| 426 |
-
state.ego_poses = abs_data["ego_poses"]
|
| 427 |
-
state.right_poses = abs_data["right_wrist_poses"]
|
| 428 |
-
state.left_poses = abs_data["left_wrist_poses"]
|
| 429 |
-
state.right_fingers = abs_data["right_fingers"]
|
| 430 |
-
state.left_fingers = abs_data["left_fingers"]
|
| 431 |
-
log.info(
|
| 432 |
-
f"Overlay absolute mode | ego range: "
|
| 433 |
-
f"[{abs_data['ego_poses'][:, :3, 3].min():.3f}, "
|
| 434 |
-
f"{abs_data['ego_poses'][:, :3, 3].max():.3f}] | "
|
| 435 |
-
f"R wrist[0]: {abs_data['right_wrist_poses'][0, :3, 3]}"
|
| 436 |
)
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
libero_abs_data = _build_libero_absolute_from_state(sample) if sample is not None else None
|
| 442 |
-
if libero_abs_data is not None:
|
| 443 |
-
state.right_poses = libero_abs_data["right_wrist_poses"]
|
| 444 |
-
|
| 445 |
-
state.right_fingers = _fingers_local_to_world(state.right_fingers, state.right_poses)
|
| 446 |
-
state.left_fingers = _fingers_local_to_world(state.left_fingers, state.left_poses)
|
| 447 |
-
|
| 448 |
-
state.action_raw = unified.action.astype(np.float32)
|
| 449 |
state.T = action.shape[0]
|
| 450 |
return state
|
| 451 |
|
| 452 |
|
| 453 |
-
# ─── Video Extraction ─────────────────────────────────────────────────────────
|
| 454 |
-
|
| 455 |
-
|
| 456 |
def get_video_from_sample(sample: dict) -> np.ndarray | None:
|
| 457 |
-
"""Extract video frames from a dataset sample.
|
| 458 |
-
|
| 459 |
-
Returns ``(T+1, H, W, 3)`` uint8 array, or None.
|
| 460 |
-
"""
|
| 461 |
video = sample.get("video")
|
| 462 |
if video is None:
|
| 463 |
return None
|
| 464 |
if isinstance(video, torch.Tensor):
|
| 465 |
video = video.numpy()
|
| 466 |
-
|
| 467 |
if video.ndim == 4:
|
| 468 |
-
|
| 469 |
-
if
|
| 470 |
video = np.transpose(video, (1, 2, 3, 0))
|
| 471 |
-
elif video.shape[1] in (1, 3) and
|
| 472 |
video = np.transpose(video, (0, 2, 3, 1))
|
| 473 |
-
|
| 474 |
if video.dtype in (np.float32, np.float64):
|
| 475 |
video = np.clip(video * 255, 0, 255).astype(np.uint8)
|
| 476 |
-
|
| 477 |
if video.ndim == 4 and video.shape[-1] == 1:
|
| 478 |
video = np.repeat(video, 3, axis=-1)
|
| 479 |
-
|
| 480 |
return video
|
|
|
|
| 1 |
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
# SPDX-License-Identifier: OpenMDW-1.1
|
| 3 |
|
| 4 |
+
"""Raw action converters for the standalone action viewer.
|
| 5 |
|
| 6 |
+
The Space release is raw-action-only. It accepts 9D ego/camera actions, 10D
|
| 7 |
+
single-arm actions, and 20D dual-arm actions. The renderer still uses a fixed
|
| 8 |
+
57D internal canvas so the downstream drawing code can stay format-agnostic,
|
| 9 |
+
but hand-pose/57D datasets are intentionally not part of this release.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
"""
|
| 11 |
|
| 12 |
from __future__ import annotations
|
| 13 |
|
| 14 |
+
from dataclasses import dataclass
|
| 15 |
from enum import Enum
|
| 16 |
from typing import Any
|
| 17 |
|
| 18 |
import numpy as np
|
| 19 |
import torch
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
NO_FINGERS = (False, False, False, False, False)
|
| 22 |
|
| 23 |
|
|
|
|
| 27 |
EGO_9D = "9d"
|
| 28 |
SINGLE_ARM_10D = "10d"
|
| 29 |
DUAL_ARM_20D = "20d"
|
|
|
|
| 30 |
|
| 31 |
@property
|
| 32 |
def expected_dim(self) -> int:
|
| 33 |
+
return {self.EGO_9D: 9, self.SINGLE_ARM_10D: 10, self.DUAL_ARM_20D: 20}[self]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
@dataclass
|
| 37 |
class Action57DMask:
|
| 38 |
+
"""Per-component validity for the internal 57D rendering canvas."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
ego: bool = False
|
| 41 |
right_wrist: bool = False
|
|
|
|
| 46 |
|
| 47 |
@dataclass
|
| 48 |
class UnifiedAction:
|
| 49 |
+
"""Internal 57D action plus scalar gripper side channels."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
+
action: np.ndarray
|
| 52 |
mask: Action57DMask
|
| 53 |
+
gripper_right: np.ndarray | None = None
|
| 54 |
+
gripper_left: np.ndarray | None = None
|
| 55 |
|
| 56 |
|
| 57 |
@dataclass
|
| 58 |
class SceneState:
|
| 59 |
+
"""Renderer-ready reconstructed trajectories for one sample."""
|
| 60 |
|
| 61 |
+
mask: Action57DMask
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
ego_poses: np.ndarray | None = None
|
| 63 |
right_poses: np.ndarray | None = None
|
| 64 |
left_poses: np.ndarray | None = None
|
|
|
|
| 65 |
right_fingers: np.ndarray | None = None
|
| 66 |
left_fingers: np.ndarray | None = None
|
|
|
|
| 67 |
gripper_right: np.ndarray | None = None
|
| 68 |
gripper_left: np.ndarray | None = None
|
| 69 |
+
video: np.ndarray | None = None
|
| 70 |
+
action_raw: np.ndarray | None = None
|
|
|
|
| 71 |
T: int = 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
|
|
|
| 73 |
|
| 74 |
+
def to_unified_from_9d(action: np.ndarray) -> UnifiedAction:
|
| 75 |
+
"""9D camera/AV ``[pos(3)+rot6d(6)]`` → ego only."""
|
| 76 |
+
a = np.zeros((action.shape[0], 57), dtype=np.float32)
|
| 77 |
+
a[:, 0:9] = action[:, :9]
|
| 78 |
+
return UnifiedAction(action=a, mask=Action57DMask(ego=True))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
|
| 81 |
def to_unified_from_10d(action: np.ndarray) -> UnifiedAction:
|
| 82 |
"""10D single arm ``[pos(3)+rot6d(6)+grip(1)]`` → right wrist + gripper."""
|
| 83 |
+
a = np.zeros((action.shape[0], 57), dtype=np.float32)
|
|
|
|
| 84 |
a[:, 9:18] = action[:, :9]
|
| 85 |
return UnifiedAction(
|
| 86 |
action=a,
|
|
|
|
| 90 |
|
| 91 |
|
| 92 |
def to_unified_from_20d(action: np.ndarray) -> UnifiedAction:
|
| 93 |
+
"""20D dual arm ``[left(10) | right(10)]`` → both wrists + both grippers."""
|
| 94 |
+
a = np.zeros((action.shape[0], 57), dtype=np.float32)
|
| 95 |
+
a[:, 33:42] = action[:, :9]
|
| 96 |
+
a[:, 9:18] = action[:, 10:19]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
return UnifiedAction(
|
| 98 |
action=a,
|
| 99 |
mask=Action57DMask(right_wrist=True, left_wrist=True),
|
| 100 |
+
gripper_right=action[:, 19].astype(np.float32),
|
| 101 |
+
gripper_left=action[:, 9].astype(np.float32),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
)
|
| 103 |
|
| 104 |
|
| 105 |
def _validate_action_shape(action: np.ndarray, action_format: ActionFormat) -> None:
|
|
|
|
| 106 |
if action.ndim != 2:
|
| 107 |
raise ValueError(f"Expected a rank-2 action array, got shape {action.shape}")
|
|
|
|
| 108 |
expected_dim = action_format.expected_dim
|
| 109 |
+
actual_dim = int(action.shape[-1])
|
| 110 |
if actual_dim != expected_dim:
|
| 111 |
raise ValueError(f"Action format {action_format.value} expects trailing dim {expected_dim}, got {actual_dim}")
|
| 112 |
|
| 113 |
|
| 114 |
def to_unified(action: np.ndarray, action_format: ActionFormat) -> UnifiedAction:
|
| 115 |
+
"""Convert one supported raw action format into the internal render canvas."""
|
| 116 |
_validate_action_shape(action, action_format)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
if action_format is ActionFormat.EGO_9D:
|
| 118 |
return to_unified_from_9d(action)
|
| 119 |
if action_format is ActionFormat.SINGLE_ARM_10D:
|
| 120 |
return to_unified_from_10d(action)
|
| 121 |
+
if action_format is ActionFormat.DUAL_ARM_20D:
|
| 122 |
+
return to_unified_from_20d(action)
|
| 123 |
raise ValueError(f"Unsupported action format: {action_format}")
|
| 124 |
|
| 125 |
|
| 126 |
def _pos_rot6d_to_mat(se3: np.ndarray) -> np.ndarray:
|
| 127 |
"""Convert ``(N, 9)`` pos+rot6d to ``(N, 4, 4)`` SE(3) matrices."""
|
| 128 |
+
n = se3.shape[0]
|
| 129 |
pos = se3[:, :3]
|
| 130 |
r6 = se3[:, 3:9]
|
|
|
|
| 131 |
col0 = r6[:, :3].copy()
|
| 132 |
+
col0 = col0 / (np.linalg.norm(col0, axis=-1, keepdims=True) + 1e-8)
|
|
|
|
|
|
|
| 133 |
col1 = r6[:, 3:6] - np.sum(r6[:, 3:6] * col0, axis=-1, keepdims=True) * col0
|
| 134 |
+
col1 = col1 / (np.linalg.norm(col1, axis=-1, keepdims=True) + 1e-8)
|
|
|
|
|
|
|
| 135 |
col2 = np.cross(col0, col1)
|
| 136 |
+
mats = np.tile(np.eye(4, dtype=np.float32), (n, 1, 1))
|
|
|
|
| 137 |
mats[:, :3, 0] = col0
|
| 138 |
mats[:, :3, 1] = col1
|
| 139 |
mats[:, :3, 2] = col2
|
|
|
|
| 146 |
initial_pose: np.ndarray | None = None,
|
| 147 |
pose_convention: str = "backward_framewise",
|
| 148 |
) -> np.ndarray:
|
| 149 |
+
"""Chain ``(T, 9)`` relative deltas into ``(T+1, 4, 4)`` absolute poses."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
delta_mats = _pos_rot6d_to_mat(deltas)
|
| 151 |
+
initial = np.eye(4, dtype=np.float32) if initial_pose is None else initial_pose.astype(np.float32)
|
| 152 |
+
poses = np.empty((deltas.shape[0] + 1, 4, 4), dtype=np.float32)
|
| 153 |
+
poses[0] = initial
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
if pose_convention == "absolute":
|
| 155 |
poses[1:] = delta_mats
|
| 156 |
else:
|
| 157 |
+
for t in range(deltas.shape[0]):
|
| 158 |
poses[t + 1] = poses[t] @ delta_mats[t]
|
|
|
|
| 159 |
return poses
|
| 160 |
|
| 161 |
|
| 162 |
+
def _apply_pose_base(poses: np.ndarray | None, base_pose: np.ndarray | None) -> np.ndarray | None:
|
| 163 |
+
if poses is None or base_pose is None:
|
| 164 |
+
return poses
|
| 165 |
+
return np.einsum("ij,tjk->tik", base_pose.astype(np.float32), poses).astype(np.float32)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
|
| 168 |
def build_scene_state(
|
| 169 |
unified: UnifiedAction,
|
| 170 |
+
*,
|
| 171 |
initial_pose: np.ndarray | None = None,
|
| 172 |
initial_pose_right: np.ndarray | None = None,
|
| 173 |
initial_pose_left: np.ndarray | None = None,
|
| 174 |
right_base_pose: np.ndarray | None = None,
|
| 175 |
left_base_pose: np.ndarray | None = None,
|
| 176 |
pose_convention: str = "backward_framewise",
|
| 177 |
+
sample: dict[str, Any] | None = None,
|
| 178 |
) -> SceneState:
|
| 179 |
+
"""Reconstruct viewer trajectories from raw-action-derived ``UnifiedAction``."""
|
| 180 |
+
del sample
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
action = unified.action
|
| 182 |
+
mask = unified.mask
|
| 183 |
+
state = SceneState(mask=mask, gripper_right=unified.gripper_right, gripper_left=unified.gripper_left)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
if mask.ego:
|
| 185 |
+
state.ego_poses = _chain_se3(action[:, 0:9], initial_pose, pose_convention)
|
| 186 |
if mask.right_wrist:
|
| 187 |
+
state.right_poses = _chain_se3(
|
| 188 |
+
action[:, 9:18],
|
| 189 |
+
initial_pose_right if initial_pose_right is not None else initial_pose,
|
| 190 |
+
pose_convention,
|
| 191 |
+
)
|
| 192 |
if mask.left_wrist:
|
| 193 |
+
state.left_poses = _chain_se3(
|
| 194 |
+
action[:, 33:42],
|
| 195 |
+
initial_pose_left if initial_pose_left is not None else initial_pose,
|
| 196 |
+
pose_convention,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
)
|
| 198 |
+
state.right_poses = _apply_pose_base(state.right_poses, right_base_pose)
|
| 199 |
+
state.left_poses = _apply_pose_base(state.left_poses, left_base_pose)
|
| 200 |
+
state.action_raw = action.astype(np.float32)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
state.T = action.shape[0]
|
| 202 |
return state
|
| 203 |
|
| 204 |
|
|
|
|
|
|
|
|
|
|
| 205 |
def get_video_from_sample(sample: dict) -> np.ndarray | None:
|
| 206 |
+
"""Extract video frames from a dataset sample as ``(T+1, H, W, 3)`` uint8."""
|
|
|
|
|
|
|
|
|
|
| 207 |
video = sample.get("video")
|
| 208 |
if video is None:
|
| 209 |
return None
|
| 210 |
if isinstance(video, torch.Tensor):
|
| 211 |
video = video.numpy()
|
|
|
|
| 212 |
if video.ndim == 4:
|
| 213 |
+
c, t_dim, _, _ = video.shape
|
| 214 |
+
if c in (1, 3) and t_dim > 3:
|
| 215 |
video = np.transpose(video, (1, 2, 3, 0))
|
| 216 |
+
elif video.shape[1] in (1, 3) and t_dim <= 3:
|
| 217 |
video = np.transpose(video, (0, 2, 3, 1))
|
|
|
|
| 218 |
if video.dtype in (np.float32, np.float64):
|
| 219 |
video = np.clip(video * 255, 0, 255).astype(np.uint8)
|
|
|
|
| 220 |
if video.ndim == 4 and video.shape[-1] == 1:
|
| 221 |
video = np.repeat(video, 3, axis=-1)
|
|
|
|
| 222 |
return video
|
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/unified_renderer.py
CHANGED
|
@@ -15,7 +15,7 @@ import numpy as np
|
|
| 15 |
|
| 16 |
from cosmos_framework.utils import log
|
| 17 |
from cosmos_framework.data.vfm.action.urdf_visualizer.robot_scene_model import RobotSceneModel
|
| 18 |
-
from cosmos_framework.data.vfm.action.urdf_visualizer.unified_action import
|
| 19 |
|
| 20 |
|
| 21 |
class UnifiedRenderer:
|
|
@@ -41,13 +41,7 @@ class UnifiedRenderer:
|
|
| 41 |
COLOR_EGO_TOP = (231, 76, 60) # red
|
| 42 |
COLOR_RIGHT = (243, 156, 18) # orange
|
| 43 |
COLOR_LEFT = (155, 89, 182) # purple
|
| 44 |
-
|
| 45 |
-
(231, 76, 60),
|
| 46 |
-
(241, 196, 15),
|
| 47 |
-
(46, 204, 113),
|
| 48 |
-
(52, 152, 219),
|
| 49 |
-
(155, 89, 182),
|
| 50 |
-
]
|
| 51 |
|
| 52 |
@staticmethod
|
| 53 |
def _soften_color(color: tuple[int, int, int], mix: float = 0.35) -> tuple[int, int, int]:
|
|
@@ -135,23 +129,14 @@ class UnifiedRenderer:
|
|
| 135 |
point_size=0.015,
|
| 136 |
point_shape="circle",
|
| 137 |
)
|
| 138 |
-
self.right_fingers = [
|
| 139 |
-
server.scene.add_icosphere(
|
| 140 |
-
self._p(f"/right/finger_{FINGER_NAMES[i]}"),
|
| 141 |
-
radius=self.TIP_RADIUS,
|
| 142 |
-
color=self.FINGER_COLORS[i],
|
| 143 |
-
position=(0.0, 0.0, 0.0),
|
| 144 |
-
)
|
| 145 |
-
for i in range(5)
|
| 146 |
-
]
|
| 147 |
self.right_gripper_tips = [
|
| 148 |
server.scene.add_icosphere(
|
| 149 |
self._p(f"/right/gripper_tip_{side}"),
|
| 150 |
radius=self.TIP_RADIUS,
|
| 151 |
-
color=self.
|
| 152 |
position=(0.0, 0.0, 0.0),
|
| 153 |
)
|
| 154 |
-
for i, side in enumerate(("
|
| 155 |
]
|
| 156 |
|
| 157 |
# ── Left effector ──
|
|
@@ -171,23 +156,14 @@ class UnifiedRenderer:
|
|
| 171 |
point_size=0.015,
|
| 172 |
point_shape="circle",
|
| 173 |
)
|
| 174 |
-
self.left_fingers = [
|
| 175 |
-
server.scene.add_icosphere(
|
| 176 |
-
self._p(f"/left/finger_{FINGER_NAMES[i]}"),
|
| 177 |
-
radius=self.TIP_RADIUS,
|
| 178 |
-
color=self.FINGER_COLORS[i],
|
| 179 |
-
position=(0.0, 0.0, 0.0),
|
| 180 |
-
)
|
| 181 |
-
for i in range(5)
|
| 182 |
-
]
|
| 183 |
self.left_gripper_tips = [
|
| 184 |
server.scene.add_icosphere(
|
| 185 |
self._p(f"/left/gripper_tip_{side}"),
|
| 186 |
radius=self.TIP_RADIUS,
|
| 187 |
-
color=self.
|
| 188 |
position=(0.0, 0.0, 0.0),
|
| 189 |
)
|
| 190 |
-
for i, side in enumerate(("
|
| 191 |
]
|
| 192 |
|
| 193 |
# ── IK robot meshes ──
|
|
@@ -288,13 +264,6 @@ class UnifiedRenderer:
|
|
| 288 |
self.right_traj,
|
| 289 |
show,
|
| 290 |
)
|
| 291 |
-
self._update_fingers(
|
| 292 |
-
t,
|
| 293 |
-
state.right_fingers,
|
| 294 |
-
mask.right_fingers,
|
| 295 |
-
self.right_fingers,
|
| 296 |
-
show,
|
| 297 |
-
)
|
| 298 |
self._update_gripper(
|
| 299 |
t,
|
| 300 |
state.right_poses,
|
|
@@ -315,13 +284,6 @@ class UnifiedRenderer:
|
|
| 315 |
self.left_traj,
|
| 316 |
show,
|
| 317 |
)
|
| 318 |
-
self._update_fingers(
|
| 319 |
-
t,
|
| 320 |
-
state.left_fingers,
|
| 321 |
-
mask.left_fingers,
|
| 322 |
-
self.left_fingers,
|
| 323 |
-
show,
|
| 324 |
-
)
|
| 325 |
self._update_gripper(
|
| 326 |
t,
|
| 327 |
state.left_poses,
|
|
@@ -364,9 +326,6 @@ class UnifiedRenderer:
|
|
| 364 |
def _v(active):
|
| 365 |
return "✓" if active else "·"
|
| 366 |
|
| 367 |
-
gr = a[18:33].reshape(5, 3)
|
| 368 |
-
gl = a[42:57].reshape(5, 3)
|
| 369 |
-
|
| 370 |
# Gripper auxiliary values (not in 57D vector)
|
| 371 |
grip_r_str = ""
|
| 372 |
grip_l_str = ""
|
|
@@ -383,10 +342,7 @@ class UnifiedRenderer:
|
|
| 383 |
"",
|
| 384 |
f"{_v(mask.right_wrist)} R wrist pos [9:12] {_fmt(a[9:12])}",
|
| 385 |
f" {' ' * 1} rot [12:18] {_fmt(a[12:18])}",
|
| 386 |
-
f" R fingers [18:33]",
|
| 387 |
]
|
| 388 |
-
for i, name in enumerate(FINGER_NAMES):
|
| 389 |
-
parts.append(f" {_v(mask.right_fingers[i])} {name:7s} {_fmt(gr[i])}")
|
| 390 |
if grip_r_str:
|
| 391 |
parts.append(grip_r_str)
|
| 392 |
|
|
@@ -394,10 +350,7 @@ class UnifiedRenderer:
|
|
| 394 |
"",
|
| 395 |
f"{_v(mask.left_wrist)} L wrist pos [33:36] {_fmt(a[33:36])}",
|
| 396 |
f" {' ' * 1} rot [36:42] {_fmt(a[36:42])}",
|
| 397 |
-
f" L fingers [42:57]",
|
| 398 |
]
|
| 399 |
-
for i, name in enumerate(FINGER_NAMES):
|
| 400 |
-
parts.append(f" {_v(mask.left_fingers[i])} {name:7s} {_fmt(gl[i])}")
|
| 401 |
if grip_l_str:
|
| 402 |
parts.append(grip_l_str)
|
| 403 |
|
|
@@ -476,26 +429,6 @@ class UnifiedRenderer:
|
|
| 476 |
ee.visible = False
|
| 477 |
traj.visible = False
|
| 478 |
|
| 479 |
-
# ─── Private: Fingers ─────────────────────────────────────────────────────
|
| 480 |
-
|
| 481 |
-
def _update_fingers(self, t, fingers, finger_mask, handles, show):
|
| 482 |
-
if fingers is None or t >= len(fingers):
|
| 483 |
-
for h in handles:
|
| 484 |
-
h.visible = False
|
| 485 |
-
return
|
| 486 |
-
if not show.get("fingertips", True):
|
| 487 |
-
for h in handles:
|
| 488 |
-
h.visible = False
|
| 489 |
-
return
|
| 490 |
-
|
| 491 |
-
g = fingers[t] # (5, 3)
|
| 492 |
-
for fi, h in enumerate(handles):
|
| 493 |
-
if finger_mask[fi]:
|
| 494 |
-
h.position = g[fi].astype(np.float32)
|
| 495 |
-
h.visible = True
|
| 496 |
-
else:
|
| 497 |
-
h.visible = False
|
| 498 |
-
|
| 499 |
# ─── Private: Gripper ─────────────────────────────────────────────────────
|
| 500 |
|
| 501 |
def _update_gripper(self, t, poses, gripper, wrist_active, finger_mask, handle, show):
|
|
@@ -702,27 +635,6 @@ class UnifiedRenderer:
|
|
| 702 |
if self._robot_scene_model is None:
|
| 703 |
return
|
| 704 |
|
| 705 |
-
# Joint-position datasets (e.g. robomind-ur): bypass IK, use FK directly
|
| 706 |
-
if state.joint_configs is not None:
|
| 707 |
-
from cosmos_framework.data.vfm.action.urdf_visualizer.ik_solver import compute_mujoco_geom_transforms
|
| 708 |
-
from cosmos_framework.data.vfm.action.urdf_visualizer.robot_scene_model import get_mjcf_path
|
| 709 |
-
|
| 710 |
-
try:
|
| 711 |
-
mjcf_path = get_mjcf_path(entry.robot_name)
|
| 712 |
-
transforms, _, _fk_ee_poses, robot_frames = compute_mujoco_geom_transforms(
|
| 713 |
-
mjcf_path, state.joint_configs
|
| 714 |
-
)
|
| 715 |
-
self._ik_right = transforms
|
| 716 |
-
self._robot_frames_right = robot_frames
|
| 717 |
-
self._rebuild_robot_frame_handles("right", robot_frames)
|
| 718 |
-
log.info(f"FK geom transforms computed for {len(transforms)} frames ({entry.robot_name})")
|
| 719 |
-
except Exception as e:
|
| 720 |
-
log.warning(f"FK failed for {entry.robot_name}: {e}")
|
| 721 |
-
import traceback
|
| 722 |
-
|
| 723 |
-
traceback.print_exc()
|
| 724 |
-
return
|
| 725 |
-
|
| 726 |
# Right arm IK
|
| 727 |
if state.right_poses is not None:
|
| 728 |
try:
|
|
@@ -807,8 +719,6 @@ class UnifiedRenderer:
|
|
| 807 |
self.left_traj,
|
| 808 |
]:
|
| 809 |
attr.visible = False
|
| 810 |
-
for h in self.right_fingers + self.left_fingers:
|
| 811 |
-
h.visible = False
|
| 812 |
for h in self.right_gripper_tips + self.left_gripper_tips:
|
| 813 |
h.visible = False
|
| 814 |
for h in self.robot_right + self.robot_left:
|
|
|
|
| 15 |
|
| 16 |
from cosmos_framework.utils import log
|
| 17 |
from cosmos_framework.data.vfm.action.urdf_visualizer.robot_scene_model import RobotSceneModel
|
| 18 |
+
from cosmos_framework.data.vfm.action.urdf_visualizer.unified_action import SceneState
|
| 19 |
|
| 20 |
|
| 21 |
class UnifiedRenderer:
|
|
|
|
| 41 |
COLOR_EGO_TOP = (231, 76, 60) # red
|
| 42 |
COLOR_RIGHT = (243, 156, 18) # orange
|
| 43 |
COLOR_LEFT = (155, 89, 182) # purple
|
| 44 |
+
GRIPPER_TIP_COLORS = [(231, 76, 60), (241, 196, 15)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
@staticmethod
|
| 47 |
def _soften_color(color: tuple[int, int, int], mix: float = 0.35) -> tuple[int, int, int]:
|
|
|
|
| 129 |
point_size=0.015,
|
| 130 |
point_shape="circle",
|
| 131 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
self.right_gripper_tips = [
|
| 133 |
server.scene.add_icosphere(
|
| 134 |
self._p(f"/right/gripper_tip_{side}"),
|
| 135 |
radius=self.TIP_RADIUS,
|
| 136 |
+
color=self.GRIPPER_TIP_COLORS[i],
|
| 137 |
position=(0.0, 0.0, 0.0),
|
| 138 |
)
|
| 139 |
+
for i, side in enumerate(("left", "right"))
|
| 140 |
]
|
| 141 |
|
| 142 |
# ── Left effector ──
|
|
|
|
| 156 |
point_size=0.015,
|
| 157 |
point_shape="circle",
|
| 158 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
self.left_gripper_tips = [
|
| 160 |
server.scene.add_icosphere(
|
| 161 |
self._p(f"/left/gripper_tip_{side}"),
|
| 162 |
radius=self.TIP_RADIUS,
|
| 163 |
+
color=self.GRIPPER_TIP_COLORS[i],
|
| 164 |
position=(0.0, 0.0, 0.0),
|
| 165 |
)
|
| 166 |
+
for i, side in enumerate(("left", "right"))
|
| 167 |
]
|
| 168 |
|
| 169 |
# ── IK robot meshes ──
|
|
|
|
| 264 |
self.right_traj,
|
| 265 |
show,
|
| 266 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
self._update_gripper(
|
| 268 |
t,
|
| 269 |
state.right_poses,
|
|
|
|
| 284 |
self.left_traj,
|
| 285 |
show,
|
| 286 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 287 |
self._update_gripper(
|
| 288 |
t,
|
| 289 |
state.left_poses,
|
|
|
|
| 326 |
def _v(active):
|
| 327 |
return "✓" if active else "·"
|
| 328 |
|
|
|
|
|
|
|
|
|
|
| 329 |
# Gripper auxiliary values (not in 57D vector)
|
| 330 |
grip_r_str = ""
|
| 331 |
grip_l_str = ""
|
|
|
|
| 342 |
"",
|
| 343 |
f"{_v(mask.right_wrist)} R wrist pos [9:12] {_fmt(a[9:12])}",
|
| 344 |
f" {' ' * 1} rot [12:18] {_fmt(a[12:18])}",
|
|
|
|
| 345 |
]
|
|
|
|
|
|
|
| 346 |
if grip_r_str:
|
| 347 |
parts.append(grip_r_str)
|
| 348 |
|
|
|
|
| 350 |
"",
|
| 351 |
f"{_v(mask.left_wrist)} L wrist pos [33:36] {_fmt(a[33:36])}",
|
| 352 |
f" {' ' * 1} rot [36:42] {_fmt(a[36:42])}",
|
|
|
|
| 353 |
]
|
|
|
|
|
|
|
| 354 |
if grip_l_str:
|
| 355 |
parts.append(grip_l_str)
|
| 356 |
|
|
|
|
| 429 |
ee.visible = False
|
| 430 |
traj.visible = False
|
| 431 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 432 |
# ─── Private: Gripper ─────────────────────────────────────────────────────
|
| 433 |
|
| 434 |
def _update_gripper(self, t, poses, gripper, wrist_active, finger_mask, handle, show):
|
|
|
|
| 635 |
if self._robot_scene_model is None:
|
| 636 |
return
|
| 637 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 638 |
# Right arm IK
|
| 639 |
if state.right_poses is not None:
|
| 640 |
try:
|
|
|
|
| 719 |
self.left_traj,
|
| 720 |
]:
|
| 721 |
attr.visible = False
|
|
|
|
|
|
|
| 722 |
for h in self.right_gripper_tips + self.left_gripper_tips:
|
| 723 |
h.visible = False
|
| 724 |
for h in self.robot_right + self.robot_left:
|
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/ur5e_robotiq_2f85.xml
DELETED
|
@@ -1,326 +0,0 @@
|
|
| 1 |
-
<!--
|
| 2 |
-
SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 3 |
-
SPDX-License-Identifier: OpenMDW-1.1
|
| 4 |
-
-->
|
| 5 |
-
|
| 6 |
-
<!-- UR5e + Robotiq 2F-85 gripper composite MJCF.
|
| 7 |
-
Created for RoboMIND UR visualization.
|
| 8 |
-
Arm (UR5e): qpos[0:6] — shoulder_pan, shoulder_lift, elbow, wrist_1, wrist_2, wrist_3
|
| 9 |
-
Gripper (Robotiq 2F-85): controlled via fingers_actuator tendon (ctrlrange 0-255).
|
| 10 |
-
Gripper signal from dataset: 0=open, 1=closed; scale by 255 before setting ctrl.
|
| 11 |
-
Mesh assets from MuJoCo Menagerie:
|
| 12 |
-
universal_robots_ur5e/assets/ (UR5e OBJ meshes)
|
| 13 |
-
robotiq_2f85_v4/assets/ (Robotiq STL meshes)
|
| 14 |
-
Both are merged into a single assets/ directory by _build_ur5e_robotiq().
|
| 15 |
-
-->
|
| 16 |
-
<mujoco model="ur5e with robotiq 2f85">
|
| 17 |
-
<compiler angle="radian" meshdir="assets" autolimits="true"/>
|
| 18 |
-
|
| 19 |
-
<option integrator="implicitfast"/>
|
| 20 |
-
|
| 21 |
-
<default>
|
| 22 |
-
<default class="ur5e">
|
| 23 |
-
<material specular="0.5" shininess="0.25"/>
|
| 24 |
-
<joint axis="0 1 0" range="-6.28319 6.28319" armature="0.1"/>
|
| 25 |
-
<general gaintype="fixed" biastype="affine" ctrlrange="-6.2831 6.2831" gainprm="2000"
|
| 26 |
-
biasprm="0 -2000 -400" forcerange="-150 150"/>
|
| 27 |
-
<default class="size3">
|
| 28 |
-
<default class="size3_limited">
|
| 29 |
-
<joint range="-3.1415 3.1415"/>
|
| 30 |
-
<general ctrlrange="-3.1415 3.1415"/>
|
| 31 |
-
</default>
|
| 32 |
-
</default>
|
| 33 |
-
<default class="size1">
|
| 34 |
-
<general gainprm="500" biasprm="0 -500 -100" forcerange="-28 28"/>
|
| 35 |
-
</default>
|
| 36 |
-
<default class="visual">
|
| 37 |
-
<geom type="mesh" contype="0" conaffinity="0" group="2"/>
|
| 38 |
-
</default>
|
| 39 |
-
<default class="collision">
|
| 40 |
-
<geom type="capsule" group="3"/>
|
| 41 |
-
<default class="eef_collision">
|
| 42 |
-
<geom type="cylinder"/>
|
| 43 |
-
</default>
|
| 44 |
-
</default>
|
| 45 |
-
<site size="0.001" rgba="0.5 0.5 0.5 0.3" group="4"/>
|
| 46 |
-
</default>
|
| 47 |
-
|
| 48 |
-
<default class="2f85">
|
| 49 |
-
<mesh scale="0.001 0.001 0.001"/>
|
| 50 |
-
<general biastype="affine"/>
|
| 51 |
-
<joint axis="0 0 1"/>
|
| 52 |
-
<default class="driver">
|
| 53 |
-
<joint range="0 0.9" armature="0.005" damping="0.1"
|
| 54 |
-
solimplimit="0.95 0.99 0.001" solreflimit="0.005 1"/>
|
| 55 |
-
</default>
|
| 56 |
-
<default class="follower">
|
| 57 |
-
<joint range="-0.872664 0.9" armature="0.001"
|
| 58 |
-
solimplimit="0.95 0.99 0.001" solreflimit="0.005 1"/>
|
| 59 |
-
</default>
|
| 60 |
-
<default class="spring_link">
|
| 61 |
-
<joint range="-0.29670597283 0.9" armature="0.001"
|
| 62 |
-
stiffness="0.05" springref="2.62" damping="0.00125"/>
|
| 63 |
-
</default>
|
| 64 |
-
<default class="coupler">
|
| 65 |
-
<joint range="-1.57 0" armature="0.001"
|
| 66 |
-
solimplimit="0.95 0.99 0.001" solreflimit="0.005 1"/>
|
| 67 |
-
</default>
|
| 68 |
-
<default class="visual_gripper">
|
| 69 |
-
<geom type="mesh" contype="0" conaffinity="0" group="2" material="robotiq_black"/>
|
| 70 |
-
</default>
|
| 71 |
-
<default class="collision_gripper">
|
| 72 |
-
<geom group="3" type="mesh" contype="0" conaffinity="0"/>
|
| 73 |
-
<default class="pad_box1">
|
| 74 |
-
<geom mass="0.00175" type="box" pos="0.043258 0 0.12"
|
| 75 |
-
size="0.002 0.011 0.009375" friction="0.7"
|
| 76 |
-
solimp="0.95 0.99 0.001" solref="0.004 1" priority="1"
|
| 77 |
-
rgba="0.55 0.55 0.55 1"/>
|
| 78 |
-
</default>
|
| 79 |
-
<default class="pad_box2">
|
| 80 |
-
<geom mass="0.00175" type="box" pos="0.043258 0 0.13875"
|
| 81 |
-
size="0.002 0.011 0.009375" friction="0.6"
|
| 82 |
-
solimp="0.95 0.99 0.001" solref="0.004 1" priority="1"
|
| 83 |
-
rgba="0.45 0.45 0.45 1"/>
|
| 84 |
-
</default>
|
| 85 |
-
</default>
|
| 86 |
-
</default>
|
| 87 |
-
</default>
|
| 88 |
-
|
| 89 |
-
<asset>
|
| 90 |
-
<!-- UR5e materials -->
|
| 91 |
-
<material class="ur5e" name="ur5e_black" rgba="0.033 0.033 0.033 1"/>
|
| 92 |
-
<material class="ur5e" name="jointgray" rgba="0.278 0.278 0.278 1"/>
|
| 93 |
-
<material class="ur5e" name="linkgray" rgba="0.82 0.82 0.82 1"/>
|
| 94 |
-
<material class="ur5e" name="urblue" rgba="0.49 0.678 0.8 1"/>
|
| 95 |
-
|
| 96 |
-
<!-- Robotiq materials -->
|
| 97 |
-
<material name="robotiq_metal" rgba="0.58 0.58 0.58 1"/>
|
| 98 |
-
<material name="robotiq_silicone" rgba="0.1882 0.1882 0.1882 1"/>
|
| 99 |
-
<material name="robotiq_black" rgba="0.149 0.149 0.149 1"/>
|
| 100 |
-
|
| 101 |
-
<!-- UR5e meshes -->
|
| 102 |
-
<mesh file="base_0.obj"/>
|
| 103 |
-
<mesh file="base_1.obj"/>
|
| 104 |
-
<mesh file="shoulder_0.obj"/>
|
| 105 |
-
<mesh file="shoulder_1.obj"/>
|
| 106 |
-
<mesh file="shoulder_2.obj"/>
|
| 107 |
-
<mesh file="upperarm_0.obj"/>
|
| 108 |
-
<mesh file="upperarm_1.obj"/>
|
| 109 |
-
<mesh file="upperarm_2.obj"/>
|
| 110 |
-
<mesh file="upperarm_3.obj"/>
|
| 111 |
-
<mesh file="forearm_0.obj"/>
|
| 112 |
-
<mesh file="forearm_1.obj"/>
|
| 113 |
-
<mesh file="forearm_2.obj"/>
|
| 114 |
-
<mesh file="forearm_3.obj"/>
|
| 115 |
-
<mesh file="wrist1_0.obj"/>
|
| 116 |
-
<mesh file="wrist1_1.obj"/>
|
| 117 |
-
<mesh file="wrist1_2.obj"/>
|
| 118 |
-
<mesh file="wrist2_0.obj"/>
|
| 119 |
-
<mesh file="wrist2_1.obj"/>
|
| 120 |
-
<mesh file="wrist2_2.obj"/>
|
| 121 |
-
<mesh file="wrist3.obj"/>
|
| 122 |
-
|
| 123 |
-
<!-- Robotiq 2F-85 meshes — NO class: STL assets are in meters, scale=0.001 must NOT apply -->
|
| 124 |
-
<mesh file="base.stl"/>
|
| 125 |
-
<mesh file="base_coupling.stl"/>
|
| 126 |
-
<mesh file="c-a01-85-open.stl"/>
|
| 127 |
-
<mesh file="driver.stl"/>
|
| 128 |
-
<mesh file="coupler.stl"/>
|
| 129 |
-
<mesh file="spring_link.stl"/>
|
| 130 |
-
<mesh file="follower.stl"/>
|
| 131 |
-
<mesh file="tongue.stl"/>
|
| 132 |
-
</asset>
|
| 133 |
-
|
| 134 |
-
<worldbody>
|
| 135 |
-
<light name="spotlight" mode="targetbodycom" target="wrist_2_link" pos="0 -1 2"/>
|
| 136 |
-
<body name="base" quat="0 0 0 -1" childclass="ur5e">
|
| 137 |
-
<inertial mass="4.0" pos="0 0 0" diaginertia="0.00443333156 0.00443333156 0.0072"/>
|
| 138 |
-
<geom mesh="base_0" material="ur5e_black" class="visual"/>
|
| 139 |
-
<geom mesh="base_1" material="jointgray" class="visual"/>
|
| 140 |
-
<body name="shoulder_link" pos="0 0 0.163">
|
| 141 |
-
<inertial mass="3.7" pos="0 0 0" diaginertia="0.0102675 0.0102675 0.00666"/>
|
| 142 |
-
<joint name="shoulder_pan_joint" class="size3" axis="0 0 1"/>
|
| 143 |
-
<geom mesh="shoulder_0" material="urblue" class="visual"/>
|
| 144 |
-
<geom mesh="shoulder_1" material="ur5e_black" class="visual"/>
|
| 145 |
-
<geom mesh="shoulder_2" material="jointgray" class="visual"/>
|
| 146 |
-
<geom class="collision" size="0.06 0.06" pos="0 0 -0.04"/>
|
| 147 |
-
<body name="upper_arm_link" pos="0 0.138 0" quat="1 0 1 0">
|
| 148 |
-
<inertial mass="8.393" pos="0 0 0.2125" diaginertia="0.133886 0.133886 0.0151074"/>
|
| 149 |
-
<joint name="shoulder_lift_joint" class="size3"/>
|
| 150 |
-
<geom mesh="upperarm_0" material="linkgray" class="visual"/>
|
| 151 |
-
<geom mesh="upperarm_1" material="ur5e_black" class="visual"/>
|
| 152 |
-
<geom mesh="upperarm_2" material="jointgray" class="visual"/>
|
| 153 |
-
<geom mesh="upperarm_3" material="urblue" class="visual"/>
|
| 154 |
-
<geom class="collision" pos="0 -0.04 0" quat="1 1 0 0" size="0.06 0.06"/>
|
| 155 |
-
<geom class="collision" size="0.05 0.2" pos="0 0 0.2"/>
|
| 156 |
-
<body name="forearm_link" pos="0 -0.131 0.425">
|
| 157 |
-
<inertial mass="2.275" pos="0 0 0.196" diaginertia="0.0311796 0.0311796 0.004095"/>
|
| 158 |
-
<joint name="elbow_joint" class="size3_limited"/>
|
| 159 |
-
<geom mesh="forearm_0" material="urblue" class="visual"/>
|
| 160 |
-
<geom mesh="forearm_1" material="linkgray" class="visual"/>
|
| 161 |
-
<geom mesh="forearm_2" material="ur5e_black" class="visual"/>
|
| 162 |
-
<geom mesh="forearm_3" material="jointgray" class="visual"/>
|
| 163 |
-
<geom class="collision" pos="0 0.08 0" quat="1 1 0 0" size="0.055 0.06"/>
|
| 164 |
-
<geom class="collision" size="0.038 0.19" pos="0 0 0.2"/>
|
| 165 |
-
<body name="wrist_1_link" pos="0 0 0.392" quat="1 0 1 0">
|
| 166 |
-
<inertial mass="1.219" pos="0 0.127 0" diaginertia="0.0025599 0.0025599 0.0021942"/>
|
| 167 |
-
<joint name="wrist_1_joint" class="size1"/>
|
| 168 |
-
<geom mesh="wrist1_0" material="ur5e_black" class="visual"/>
|
| 169 |
-
<geom mesh="wrist1_1" material="urblue" class="visual"/>
|
| 170 |
-
<geom mesh="wrist1_2" material="jointgray" class="visual"/>
|
| 171 |
-
<geom class="collision" pos="0 0.05 0" quat="1 1 0 0" size="0.04 0.07"/>
|
| 172 |
-
<body name="wrist_2_link" pos="0 0.127 0">
|
| 173 |
-
<inertial mass="1.219" pos="0 0 0.1" diaginertia="0.0025599 0.0025599 0.0021942"/>
|
| 174 |
-
<joint name="wrist_2_joint" axis="0 0 1" class="size1"/>
|
| 175 |
-
<geom mesh="wrist2_0" material="ur5e_black" class="visual"/>
|
| 176 |
-
<geom mesh="wrist2_1" material="urblue" class="visual"/>
|
| 177 |
-
<geom mesh="wrist2_2" material="jointgray" class="visual"/>
|
| 178 |
-
<geom class="collision" size="0.04 0.06" pos="0 0 0.04"/>
|
| 179 |
-
<geom class="collision" pos="0 0.02 0.1" quat="1 1 0 0" size="0.04 0.04"/>
|
| 180 |
-
<body name="wrist_3_link" pos="0 0 0.1">
|
| 181 |
-
<inertial mass="0.1889" pos="0 0.0771683 0" quat="1 0 0 1"
|
| 182 |
-
diaginertia="0.000132134 9.90863e-05 9.90863e-05"/>
|
| 183 |
-
<joint name="wrist_3_joint" class="size1"/>
|
| 184 |
-
<geom material="linkgray" mesh="wrist3" class="visual"/>
|
| 185 |
-
<geom class="eef_collision" pos="0 0.08 0" quat="1 1 0 0" size="0.04 0.02"/>
|
| 186 |
-
<site name="attachment_site" pos="0 0.1 0" quat="-1 1 0 0"/>
|
| 187 |
-
<!-- Robotiq 2F-85 attached at the UR5e flange site -->
|
| 188 |
-
<body name="robotiq_base" childclass="2f85" pos="0 0.1 0" quat="-1 1 0 0">
|
| 189 |
-
<inertial mass="0.777441" pos="7.77116e-05 8.42713e-05 0.0311656"
|
| 190 |
-
quat="0.704758 -0.00373684 -0.00570287 0.709415"
|
| 191 |
-
diaginertia="0.000260285 0.000225381 0.000152708"/>
|
| 192 |
-
<geom class="visual_gripper" pos="0 0 0.0108" quat="0 0 0 1" mesh="base"/>
|
| 193 |
-
<geom class="visual_gripper" pos="0 0 0.004" quat="1 -1 0 0" mesh="base_coupling" material="robotiq_metal"/>
|
| 194 |
-
<geom class="visual_gripper" pos="0 0 0.0108" quat="1 0 0 0" material="robotiq_metal" mesh="c-a01-85-open"/>
|
| 195 |
-
<geom class="collision_gripper" pos="0 0 0.0108" quat="0 0 0 1" mesh="base"/>
|
| 196 |
-
<body name="robotiq_left_driver" pos="-0.0306011 0.00475 0.0657045" quat="1 -1 0 0">
|
| 197 |
-
<inertial mass="0.00899563" pos="-0.0175297 0.00165308 -0.00469625"
|
| 198 |
-
quat="-0.469642 0.469642 -0.528617 0.528617"
|
| 199 |
-
diaginertia="1.72352e-06 1.60906e-06 3.22006e-07"/>
|
| 200 |
-
<joint name="left_driver_joint" class="driver"/>
|
| 201 |
-
<geom class="visual_gripper" pos="0.0306011 0.0549045 -0.0047" quat="1 1 0 0" material="robotiq_metal" mesh="driver"/>
|
| 202 |
-
<geom class="collision_gripper" pos="0.0306011 0.0549045 -0.0047" quat="1 1 0 0" mesh="driver"/>
|
| 203 |
-
<body name="robotiq_left_coupler" pos="-0.0314249 0.00453223 -0.0102" quat="0 0 0 1">
|
| 204 |
-
<inertial mass="0.0140974" pos="0.00367747 0.01986 0.0055"
|
| 205 |
-
quat="0.701447 -0.701447 0.0892884 -0.0892884"
|
| 206 |
-
diaginertia="4.16206e-06 3.52216e-06 8.88131e-07"/>
|
| 207 |
-
<geom class="visual_gripper" pos="-0.062026 -0.0503723 0.0055" quat="1 -1 0 0" mesh="coupler"/>
|
| 208 |
-
<geom class="collision_gripper" pos="-0.062026 -0.0503723 0.0055" quat="1 -1 0 0" mesh="coupler"/>
|
| 209 |
-
</body>
|
| 210 |
-
</body>
|
| 211 |
-
<body name="robotiq_left_spring_link" pos="-0.0127 -0.012 0.07222" quat="1 -1 0 0">
|
| 212 |
-
<inertial mass="0.0221642" pos="-0.0183 -0.0205732 0.01205"
|
| 213 |
-
quat="0.660941 0.660941 -0.251309 -0.251309"
|
| 214 |
-
diaginertia="8.96853e-06 6.71733e-06 2.63931e-06"/>
|
| 215 |
-
<joint name="left_spring_link_joint" class="spring_link"/>
|
| 216 |
-
<geom class="visual_gripper" pos="0.0127 0.06142 0.01205" quat="1 1 0 0" mesh="spring_link"/>
|
| 217 |
-
<geom class="collision_gripper" pos="0.0127 0.06142 0.01205" quat="1 1 0 0" mesh="spring_link"/>
|
| 218 |
-
<body name="robotiq_left_follower" pos="-0.0382079 -0.0425003 0.00295" quat="0 -1 -1.90231e-05 0">
|
| 219 |
-
<inertial mass="0.0125222" pos="-0.00852976 -0.0014822 -0.00910001"
|
| 220 |
-
quat="0.359439 0.359439 0.608937 0.608937"
|
| 221 |
-
diaginertia="2.67415e-06 2.4559e-06 6.02031e-07"/>
|
| 222 |
-
<joint name="left_follower_joint" class="follower"/>
|
| 223 |
-
<geom class="visual_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" mesh="follower"/>
|
| 224 |
-
<geom class="visual_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" material="robotiq_metal" mesh="tongue"/>
|
| 225 |
-
<geom class="collision_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" mesh="follower"/>
|
| 226 |
-
<geom class="collision_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" mesh="tongue"/>
|
| 227 |
-
<body name="robotiq_left_pad" pos="-0.0377897 -0.103916 -0.0091" quat="1 -1 0 0">
|
| 228 |
-
<geom class="pad_box1" name="left_pad1"/>
|
| 229 |
-
<geom class="pad_box2" name="left_pad2"/>
|
| 230 |
-
</body>
|
| 231 |
-
</body>
|
| 232 |
-
</body>
|
| 233 |
-
<body name="robotiq_right_driver" pos="0.0306011 -0.00475 0.0657045" quat="0 0 -1 1">
|
| 234 |
-
<inertial mass="0.00899563" pos="-0.0175297 0.00165308 -0.00469625"
|
| 235 |
-
quat="-0.469642 0.469642 -0.528617 0.528617"
|
| 236 |
-
diaginertia="1.72352e-06 1.60906e-06 3.22006e-07"/>
|
| 237 |
-
<joint name="right_driver_joint" class="driver"/>
|
| 238 |
-
<geom class="visual_gripper" pos="0.0306011 0.0549045 -0.0047" quat="1 1 0 0" material="robotiq_metal" mesh="driver"/>
|
| 239 |
-
<geom class="collision_gripper" pos="0.0306011 0.0549045 -0.0047" quat="1 1 0 0" mesh="driver"/>
|
| 240 |
-
<body name="robotiq_right_coupler" pos="-0.0314249 0.00453223 -0.0102" quat="0 0 0 1">
|
| 241 |
-
<inertial mass="0.0140974" pos="0.00367747 0.01986 0.0055"
|
| 242 |
-
quat="0.701447 -0.701447 0.0892884 -0.0892884"
|
| 243 |
-
diaginertia="4.16206e-06 3.52216e-06 8.88131e-07"/>
|
| 244 |
-
<geom class="visual_gripper" pos="-0.062026 -0.0503723 0.0055" quat="1 -1 0 0" mesh="coupler"/>
|
| 245 |
-
<geom class="collision_gripper" pos="-0.062026 -0.0503723 0.0055" quat="1 -1 0 0" mesh="coupler"/>
|
| 246 |
-
</body>
|
| 247 |
-
</body>
|
| 248 |
-
<body name="robotiq_right_spring_link" pos="0.0127 0.012 0.07222" quat="0 0 -1 1">
|
| 249 |
-
<inertial mass="0.0221642" pos="-0.0183 -0.0205732 0.01205"
|
| 250 |
-
quat="0.660941 0.660941 -0.251309 -0.251309"
|
| 251 |
-
diaginertia="8.96853e-06 6.71733e-06 2.63931e-06"/>
|
| 252 |
-
<joint name="right_spring_link_joint" class="spring_link"/>
|
| 253 |
-
<geom class="visual_gripper" pos="0.0127 0.06142 0.01205" quat="1 1 0 0" mesh="spring_link"/>
|
| 254 |
-
<geom class="collision_gripper" pos="0.0127 0.06142 0.01205" quat="1 1 0 0" mesh="spring_link"/>
|
| 255 |
-
<body name="robotiq_right_follower" pos="-0.0382079 -0.0425003 0.00295" quat="0 -1 0 0">
|
| 256 |
-
<inertial mass="0.0125222" pos="-0.00852976 -0.0014822 -0.00910001"
|
| 257 |
-
quat="0.359439 0.359439 0.608937 0.608937"
|
| 258 |
-
diaginertia="2.67415e-06 2.4559e-06 6.02031e-07"/>
|
| 259 |
-
<joint name="right_follower_joint" class="follower"/>
|
| 260 |
-
<geom class="visual_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" material="robotiq_metal" mesh="tongue"/>
|
| 261 |
-
<geom class="visual_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" mesh="follower"/>
|
| 262 |
-
<geom class="collision_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" mesh="tongue"/>
|
| 263 |
-
<geom class="collision_gripper" pos="0.0509079 -0.10392 -0.0091" quat="1 -1 0 0" mesh="follower"/>
|
| 264 |
-
<body name="robotiq_right_pad" pos="-0.0377897 -0.103916 -0.0091" quat="1 -1 0 0">
|
| 265 |
-
<geom class="pad_box1" name="right_pad1"/>
|
| 266 |
-
<geom class="pad_box2" name="right_pad2"/>
|
| 267 |
-
</body>
|
| 268 |
-
</body>
|
| 269 |
-
</body>
|
| 270 |
-
</body>
|
| 271 |
-
</body>
|
| 272 |
-
</body>
|
| 273 |
-
</body>
|
| 274 |
-
</body>
|
| 275 |
-
</body>
|
| 276 |
-
</body>
|
| 277 |
-
</body>
|
| 278 |
-
</worldbody>
|
| 279 |
-
|
| 280 |
-
<!-- UR5e arm actuators -->
|
| 281 |
-
<actuator>
|
| 282 |
-
<general class="size3" name="shoulder_pan" joint="shoulder_pan_joint"/>
|
| 283 |
-
<general class="size3" name="shoulder_lift" joint="shoulder_lift_joint"/>
|
| 284 |
-
<general class="size3_limited" name="elbow" joint="elbow_joint"/>
|
| 285 |
-
<general class="size1" name="wrist_1" joint="wrist_1_joint"/>
|
| 286 |
-
<general class="size1" name="wrist_2" joint="wrist_2_joint"/>
|
| 287 |
-
<general class="size1" name="wrist_3" joint="wrist_3_joint"/>
|
| 288 |
-
<!-- Robotiq gripper actuator (ctrl 0=open, 255=closed) -->
|
| 289 |
-
<general class="2f85" name="fingers_actuator" tendon="split"
|
| 290 |
-
forcerange="-5 5" ctrlrange="0 255"
|
| 291 |
-
gainprm="0.3137255 0 0" biasprm="0 -100 -10"/>
|
| 292 |
-
</actuator>
|
| 293 |
-
|
| 294 |
-
<contact>
|
| 295 |
-
<exclude body1="robotiq_base" body2="robotiq_left_driver"/>
|
| 296 |
-
<exclude body1="robotiq_base" body2="robotiq_right_driver"/>
|
| 297 |
-
<exclude body1="robotiq_base" body2="robotiq_left_spring_link"/>
|
| 298 |
-
<exclude body1="robotiq_base" body2="robotiq_right_spring_link"/>
|
| 299 |
-
<exclude body1="robotiq_right_coupler" body2="robotiq_right_follower"/>
|
| 300 |
-
<exclude body1="robotiq_left_coupler" body2="robotiq_left_follower"/>
|
| 301 |
-
</contact>
|
| 302 |
-
|
| 303 |
-
<tendon>
|
| 304 |
-
<fixed name="split">
|
| 305 |
-
<joint joint="right_driver_joint" coef="0.485"/>
|
| 306 |
-
<joint joint="left_driver_joint" coef="0.485"/>
|
| 307 |
-
</fixed>
|
| 308 |
-
</tendon>
|
| 309 |
-
|
| 310 |
-
<equality>
|
| 311 |
-
<connect anchor="-0.0179014 -0.00651468 0.0044"
|
| 312 |
-
body1="robotiq_right_follower" body2="robotiq_right_coupler"
|
| 313 |
-
solimp="0.95 0.99 0.001" solref="0.005 1"/>
|
| 314 |
-
<connect anchor="-0.0179014 -0.00651468 0.0044"
|
| 315 |
-
body1="robotiq_left_follower" body2="robotiq_left_coupler"
|
| 316 |
-
solimp="0.95 0.99 0.001" solref="0.005 1"/>
|
| 317 |
-
<joint joint1="right_driver_joint" joint2="left_driver_joint"
|
| 318 |
-
polycoef="0 1 0 0 0" solimp="0.95 0.99 0.001" solref="0.005 1"/>
|
| 319 |
-
</equality>
|
| 320 |
-
|
| 321 |
-
<keyframe>
|
| 322 |
-
<key name="home" qpos="-1.5708 -1.5708 1.5708 -1.5708 -1.5708 0 0 0 0 0 0 0"
|
| 323 |
-
ctrl="-1.5708 -1.5708 1.5708 -1.5708 -1.5708 0 0"/>
|
| 324 |
-
</keyframe>
|
| 325 |
-
|
| 326 |
-
</mujoco>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/urdf_loader.py
CHANGED
|
@@ -164,7 +164,6 @@ _EE_FRAME_CANDIDATES = [
|
|
| 164 |
"gripper", # Google Robot (Menagerie site)
|
| 165 |
"hand", # Franka Panda (standard gripper)
|
| 166 |
"attachment", # Franka + Robotiq
|
| 167 |
-
"attachment_site", # UR5e (mujoco_menagerie)
|
| 168 |
"wx250s/gripper_link", # WidowX 250S (Menagerie)
|
| 169 |
"ee_link", # generic
|
| 170 |
"tool0", # generic industrial
|
|
@@ -257,19 +256,6 @@ ROBOT_CONFIGS = {
|
|
| 257 |
"finger_joint_names": ["left_finger", "right_finger"],
|
| 258 |
"camera_body": None,
|
| 259 |
},
|
| 260 |
-
"ur5e": {
|
| 261 |
-
"menagerie": "ur5e_robotiq",
|
| 262 |
-
"mjcf": "ur5e_robotiq_2f85.xml",
|
| 263 |
-
# EE: attachment_site — the UR5e flange site where the Robotiq mounts.
|
| 264 |
-
# joint_configs[:, 6] raw UR gripper maps directly to Robotiq ctrl:
|
| 265 |
-
# ctrl = raw * 255 (0=open, 255=closed).
|
| 266 |
-
"ee_frame": "attachment_site",
|
| 267 |
-
"n_arm_joints": 6,
|
| 268 |
-
"finger_min": 0.0,
|
| 269 |
-
"finger_max": 255.0,
|
| 270 |
-
"finger_close_is_max": False,
|
| 271 |
-
"camera_body": None,
|
| 272 |
-
},
|
| 273 |
|
| 274 |
}
|
| 275 |
|
|
@@ -344,10 +330,6 @@ def _ensure_robot_assets(robot_name: str) -> Path:
|
|
| 344 |
if robot_name == "droid_franka_robotiq":
|
| 345 |
return _build_droid_franka_robotiq()
|
| 346 |
|
| 347 |
-
# Composite: UR5e arm + Robotiq 2F-85 gripper (RoboMIND UR setup)
|
| 348 |
-
if robot_name == "ur5e_robotiq":
|
| 349 |
-
return _build_ur5e_robotiq()
|
| 350 |
-
|
| 351 |
log.info(f"Downloading {robot_name} assets from mujoco_menagerie...")
|
| 352 |
_download_menagerie_model(robot_name)
|
| 353 |
return cached_dir
|
|
@@ -420,38 +402,6 @@ def _build_droid_franka_robotiq() -> Path:
|
|
| 420 |
return dst
|
| 421 |
|
| 422 |
|
| 423 |
-
def _build_ur5e_robotiq() -> Path:
|
| 424 |
-
"""Prepare composite UR5e + Robotiq 2F-85 MJCF for RoboMIND UR visualization.
|
| 425 |
-
|
| 426 |
-
The hand-tuned composite XML is committed in the repo alongside this
|
| 427 |
-
module (``ur5e_robotiq_2f85.xml``). This function downloads the mesh
|
| 428 |
-
assets from MuJoCo Menagerie (``universal_robots_ur5e`` +
|
| 429 |
-
``robotiq_2f85_v4``) and copies them together with the XML into a cache
|
| 430 |
-
directory that MuJoCo can load from.
|
| 431 |
-
"""
|
| 432 |
-
import shutil
|
| 433 |
-
|
| 434 |
-
log.info("Preparing composite UR5e + Robotiq 2F-85 (committed XML + Menagerie assets)...")
|
| 435 |
-
|
| 436 |
-
ur5e_dir = _download_menagerie_model("universal_robots_ur5e")
|
| 437 |
-
robotiq_dir = _download_menagerie_model("robotiq_2f85_v4")
|
| 438 |
-
|
| 439 |
-
dst = _ROBOT_CACHE_DIR / "ur5e_robotiq"
|
| 440 |
-
dst.mkdir(parents=True, exist_ok=True)
|
| 441 |
-
assets_dir = dst / "assets"
|
| 442 |
-
assets_dir.mkdir(exist_ok=True)
|
| 443 |
-
|
| 444 |
-
for f in (ur5e_dir / "assets").iterdir():
|
| 445 |
-
shutil.copy2(f, assets_dir / f.name)
|
| 446 |
-
for f in (robotiq_dir / "assets").iterdir():
|
| 447 |
-
shutil.copy2(f, assets_dir / f.name)
|
| 448 |
-
|
| 449 |
-
committed_xml = Path(__file__).parent / "ur5e_robotiq_2f85.xml"
|
| 450 |
-
shutil.copy2(committed_xml, dst / "ur5e_robotiq_2f85.xml")
|
| 451 |
-
|
| 452 |
-
log.info(f"Prepared composite UR5e + Robotiq 2F-85 at {dst}")
|
| 453 |
-
return dst
|
| 454 |
-
|
| 455 |
|
| 456 |
# ── Robot loaders ────────────────────────────────────────────────────────────
|
| 457 |
|
|
@@ -563,35 +513,6 @@ def _load_widowx() -> tuple[list, np.ndarray]:
|
|
| 563 |
return meshes, ee_pose
|
| 564 |
|
| 565 |
|
| 566 |
-
def _load_ur5e() -> tuple[list, np.ndarray]:
|
| 567 |
-
"""Load UR5e + Robotiq 2F-85 composite from MuJoCo Menagerie."""
|
| 568 |
-
import mujoco
|
| 569 |
-
|
| 570 |
-
mjcf_dir = _ensure_robot_assets("ur5e_robotiq")
|
| 571 |
-
mjcf_path = mjcf_dir / "ur5e_robotiq_2f85.xml"
|
| 572 |
-
model = mujoco.MjModel.from_xml_path(str(mjcf_path))
|
| 573 |
-
data = mujoco.MjData(model)
|
| 574 |
-
|
| 575 |
-
# UR5e home: -90, -90, 90, -90, -90, 0 (degrees → radians); gripper open
|
| 576 |
-
home_qpos = np.array([-1.5708, -1.5708, 1.5708, -1.5708, -1.5708, 0.0])
|
| 577 |
-
data.qpos[: len(home_qpos)] = home_qpos
|
| 578 |
-
mujoco.mj_forward(model, data)
|
| 579 |
-
|
| 580 |
-
meshes = _extract_mujoco_meshes(model, data)
|
| 581 |
-
|
| 582 |
-
# EE pose: use robotiq_base body (the flange-to-gripper attachment point)
|
| 583 |
-
ee_pose = np.eye(4, dtype=np.float32)
|
| 584 |
-
for i in range(model.nbody):
|
| 585 |
-
name = mujoco.mj_id2name(model, mujoco.mjtObj.mjOBJ_BODY, i) or ""
|
| 586 |
-
if name == "robotiq_base":
|
| 587 |
-
ee_pose[:3, 3] = data.xpos[i].astype(np.float32)
|
| 588 |
-
ee_pose[:3, :3] = data.xmat[i].reshape(3, 3).astype(np.float32)
|
| 589 |
-
break
|
| 590 |
-
|
| 591 |
-
log.info(f"UR5e+Robotiq loaded: {len(meshes)} meshes, EE pos={ee_pose[:3, 3]}")
|
| 592 |
-
return meshes, ee_pose
|
| 593 |
-
|
| 594 |
-
|
| 595 |
def _parse_urdf_origin(origin_element) -> np.ndarray:
|
| 596 |
transform = np.eye(4, dtype=np.float32)
|
| 597 |
if origin_element is None:
|
|
@@ -856,64 +777,4 @@ def get_robot_loaders() -> dict[str, callable]:
|
|
| 856 |
"google_robot": _load_google_robot,
|
| 857 |
"franka_panda": _load_franka_panda,
|
| 858 |
"widowx": _load_widowx,
|
| 859 |
-
"ur5e": _load_ur5e,
|
| 860 |
}
|
| 861 |
-
|
| 862 |
-
|
| 863 |
-
def extract_gripper_openings(unified_57d: np.ndarray, robot_name: str = "google_robot") -> np.ndarray:
|
| 864 |
-
"""Extract gripper opening fractions from unified action grasp state.
|
| 865 |
-
|
| 866 |
-
Uses fingertip spread (f0-f1 distance) to invert the FK and recover
|
| 867 |
-
the scalar gripper opening at each timestep.
|
| 868 |
-
|
| 869 |
-
Args:
|
| 870 |
-
unified_57d: (T, 57) unified action.
|
| 871 |
-
robot_name: Robot identifier for FK lookup table.
|
| 872 |
-
|
| 873 |
-
Returns:
|
| 874 |
-
(T+1,) array of gripper openings in [0, 1].
|
| 875 |
-
"""
|
| 876 |
-
T = unified_57d.shape[0]
|
| 877 |
-
grasp = unified_57d[:, 18:33].reshape(T, 5, 3)
|
| 878 |
-
all_grasp = np.concatenate([grasp[0:1], grasp], axis=0)
|
| 879 |
-
|
| 880 |
-
# Build monotonic inverse lookup from FK (robot-specific)
|
| 881 |
-
_gs = np.linspace(0, 1, 10001).astype(np.float32)
|
| 882 |
-
try:
|
| 883 |
-
if robot_name == "franka_panda":
|
| 884 |
-
from cosmos_framework.data.vfm.action.robot_descriptions.franka import franka_fingertip_fk
|
| 885 |
-
|
| 886 |
-
_tips = franka_fingertip_fk(_gs)
|
| 887 |
-
elif robot_name == "widowx":
|
| 888 |
-
from cosmos_framework.data.vfm.action.robot_descriptions.widowx import widowx_fingertip_fk
|
| 889 |
-
|
| 890 |
-
_tips = widowx_fingertip_fk(_gs)
|
| 891 |
-
elif robot_name == "ur5e":
|
| 892 |
-
from cosmos_framework.data.vfm.action.robot_descriptions.umi import _WSG50_MAX_WIDTH, umi_fingertip_fk
|
| 893 |
-
|
| 894 |
-
_tips = umi_fingertip_fk(_gs * _WSG50_MAX_WIDTH)
|
| 895 |
-
else:
|
| 896 |
-
from cosmos_framework.data.vfm.action.robot_descriptions.google_robot import (
|
| 897 |
-
google_robot_fingertip_fk_vectorized,
|
| 898 |
-
)
|
| 899 |
-
|
| 900 |
-
_tips = google_robot_fingertip_fk_vectorized(_gs)
|
| 901 |
-
_spreads = np.linalg.norm(_tips[:, 0] - _tips[:, 1], axis=1)
|
| 902 |
-
except ImportError:
|
| 903 |
-
# Fallback: linear approximation
|
| 904 |
-
_spreads = _gs * 0.145
|
| 905 |
-
_min_idx = int(np.argmin(_spreads))
|
| 906 |
-
mono_gs = _gs[_min_idx:]
|
| 907 |
-
mono_spreads = _spreads[_min_idx:]
|
| 908 |
-
|
| 909 |
-
openings = np.zeros(T + 1, dtype=np.float32)
|
| 910 |
-
for t in range(T + 1):
|
| 911 |
-
f0, f1 = all_grasp[t, 0], all_grasp[t, 1]
|
| 912 |
-
spread = np.linalg.norm(f0 - f1)
|
| 913 |
-
if spread <= mono_spreads[0]:
|
| 914 |
-
openings[t] = 0.0
|
| 915 |
-
elif spread >= mono_spreads[-1]:
|
| 916 |
-
openings[t] = 1.0
|
| 917 |
-
else:
|
| 918 |
-
openings[t] = float(np.interp(spread, mono_spreads, mono_gs))
|
| 919 |
-
return openings
|
|
|
|
| 164 |
"gripper", # Google Robot (Menagerie site)
|
| 165 |
"hand", # Franka Panda (standard gripper)
|
| 166 |
"attachment", # Franka + Robotiq
|
|
|
|
| 167 |
"wx250s/gripper_link", # WidowX 250S (Menagerie)
|
| 168 |
"ee_link", # generic
|
| 169 |
"tool0", # generic industrial
|
|
|
|
| 256 |
"finger_joint_names": ["left_finger", "right_finger"],
|
| 257 |
"camera_body": None,
|
| 258 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
|
| 260 |
}
|
| 261 |
|
|
|
|
| 330 |
if robot_name == "droid_franka_robotiq":
|
| 331 |
return _build_droid_franka_robotiq()
|
| 332 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
log.info(f"Downloading {robot_name} assets from mujoco_menagerie...")
|
| 334 |
_download_menagerie_model(robot_name)
|
| 335 |
return cached_dir
|
|
|
|
| 402 |
return dst
|
| 403 |
|
| 404 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 405 |
|
| 406 |
# ── Robot loaders ────────────────────────────────────────────────────────────
|
| 407 |
|
|
|
|
| 513 |
return meshes, ee_pose
|
| 514 |
|
| 515 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 516 |
def _parse_urdf_origin(origin_element) -> np.ndarray:
|
| 517 |
transform = np.eye(4, dtype=np.float32)
|
| 518 |
if origin_element is None:
|
|
|
|
| 777 |
"google_robot": _load_google_robot,
|
| 778 |
"franka_panda": _load_franka_panda,
|
| 779 |
"widowx": _load_widowx,
|
|
|
|
| 780 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py
CHANGED
|
@@ -4,11 +4,8 @@
|
|
| 4 |
|
| 5 |
"""Interactive 3D viewer for robot action datasets.
|
| 6 |
|
| 7 |
-
Uses
|
| 8 |
-
|
| 9 |
-
``UnifiedAction(action_57d, mask)`` before rendering.
|
| 10 |
-
|
| 11 |
-
**57D layout**: ``[ego(9) | R_wrist(9) | R_fingers(15) | L_wrist(9) | L_fingers(15)]``
|
| 12 |
|
| 13 |
Dependencies::
|
| 14 |
|
|
@@ -18,8 +15,6 @@ Usage:
|
|
| 18 |
# Use each dataset's declared raw action format:
|
| 19 |
uv run python cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py --share
|
| 20 |
|
| 21 |
-
# Override the raw action format explicitly:
|
| 22 |
-
uv run python cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py --action-format 57d --share
|
| 23 |
"""
|
| 24 |
|
| 25 |
from __future__ import annotations
|
|
@@ -92,7 +87,6 @@ def _lazycfg_to_entry(
|
|
| 92 |
dataset_class = target if isinstance(target, str) else f"{target.__module__}.{target.__qualname__}"
|
| 93 |
ds_items = ds_cfg.items() if isinstance(ds_cfg, dict) else ds_cfg.items()
|
| 94 |
dataset_kwargs = {key: value for key, value in ds_items if key != "_target_"}
|
| 95 |
-
dataset_kwargs["action_normalization"] = None
|
| 96 |
if viewer_overrides is not None:
|
| 97 |
dataset_kwargs.update(viewer_overrides)
|
| 98 |
|
|
@@ -131,8 +125,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
|
|
| 131 |
DATASET_UMI_256,
|
| 132 |
)
|
| 133 |
|
| 134 |
-
raw_action_override = {"action_normalization": None}
|
| 135 |
-
|
| 136 |
from cosmos_framework.data.vfm.action.bridge_orig_lerobot_dataset import _BRIDGE_TO_OPENCV
|
| 137 |
from cosmos_framework.data.vfm.action.droid_lerobot_dataset import _DROID_TO_OPENCV
|
| 138 |
from cosmos_framework.data.vfm.action.fractal import _GOOGLE_ROBOT_TO_OPENCV
|
|
@@ -147,7 +139,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
|
|
| 147 |
max_finger_width=0.0,
|
| 148 |
fps=10,
|
| 149 |
action_format=ActionFormat.EGO_9D,
|
| 150 |
-
viewer_overrides=raw_action_override,
|
| 151 |
),
|
| 152 |
"fractal": _lazycfg_to_entry(
|
| 153 |
DATASET_FRACTAL_256,
|
|
@@ -158,7 +149,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
|
|
| 158 |
camera_fov_deg=69.0,
|
| 159 |
camera_aspect=320 / 256,
|
| 160 |
to_opencv=_GOOGLE_ROBOT_TO_OPENCV,
|
| 161 |
-
viewer_overrides=raw_action_override,
|
| 162 |
),
|
| 163 |
"bridge": _lazycfg_to_entry(
|
| 164 |
DATASET_BRIDGE_480,
|
|
@@ -167,7 +157,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
|
|
| 167 |
fps=5,
|
| 168 |
action_format=ActionFormat.SINGLE_ARM_10D,
|
| 169 |
to_opencv=_BRIDGE_TO_OPENCV,
|
| 170 |
-
viewer_overrides=raw_action_override,
|
| 171 |
),
|
| 172 |
"droid": _lazycfg_to_entry(
|
| 173 |
DATASET_DROID_480,
|
|
@@ -176,7 +165,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
|
|
| 176 |
fps=15,
|
| 177 |
action_format=ActionFormat.SINGLE_ARM_10D,
|
| 178 |
to_opencv=_DROID_TO_OPENCV,
|
| 179 |
-
viewer_overrides=raw_action_override,
|
| 180 |
),
|
| 181 |
"umi": _lazycfg_to_entry(
|
| 182 |
DATASET_UMI_256,
|
|
@@ -184,7 +172,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
|
|
| 184 |
max_finger_width=0.0,
|
| 185 |
fps=20,
|
| 186 |
action_format=ActionFormat.SINGLE_ARM_10D,
|
| 187 |
-
viewer_overrides=raw_action_override,
|
| 188 |
),
|
| 189 |
"robomind_franka": _lazycfg_to_entry(
|
| 190 |
DATASET_ROBOMIND_FRANKA_480,
|
|
@@ -193,7 +180,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
|
|
| 193 |
fps=10,
|
| 194 |
action_format=ActionFormat.SINGLE_ARM_10D,
|
| 195 |
to_opencv=_FRANKA_TO_OPENCV,
|
| 196 |
-
viewer_overrides=raw_action_override,
|
| 197 |
),
|
| 198 |
"robomind_franka_dual": _lazycfg_to_entry(
|
| 199 |
DATASET_ROBOMIND_FRANKA_DUAL_480,
|
|
@@ -210,7 +196,6 @@ def _build_datasets() -> dict[str, DatasetEntry]:
|
|
| 210 |
dtype=np.float32,
|
| 211 |
),
|
| 212 |
to_opencv=_FRANKA_TO_OPENCV,
|
| 213 |
-
viewer_overrides=raw_action_override,
|
| 214 |
),
|
| 215 |
}
|
| 216 |
|
|
@@ -238,7 +223,7 @@ def _create_dataset(entry: DatasetEntry, chunk_length: int):
|
|
| 238 |
|
| 239 |
# UMI: factory function
|
| 240 |
if callable(cls) and not inspect.isclass(cls):
|
| 241 |
-
_OMEGACONF_BLOCKLIST = {"chunk_length", "split", "
|
| 242 |
kwargs = {k: v for k, v in kwargs.items() if k not in _OMEGACONF_BLOCKLIST}
|
| 243 |
kwargs["eager_load"] = True
|
| 244 |
return cls(**kwargs)
|
|
@@ -312,17 +297,6 @@ def _build_viewer_idle_action_spec(action_format: ActionFormat) -> Any:
|
|
| 312 |
Rot("rot6d", prefix="right"),
|
| 313 |
Gripper(prefix="right"),
|
| 314 |
)
|
| 315 |
-
if action_format is ActionFormat.UNIFIED_57D:
|
| 316 |
-
return build_action_spec(
|
| 317 |
-
Pos(prefix="ego"),
|
| 318 |
-
Rot("rot6d", prefix="ego"),
|
| 319 |
-
Pos(prefix="right_wrist"),
|
| 320 |
-
Rot("rot6d", prefix="right_wrist"),
|
| 321 |
-
Pos(dim=15, prefix="right_fingers"),
|
| 322 |
-
Pos(prefix="left_wrist"),
|
| 323 |
-
Rot("rot6d", prefix="left_wrist"),
|
| 324 |
-
Pos(dim=15, prefix="left_fingers"),
|
| 325 |
-
)
|
| 326 |
raise ValueError(f"Unsupported action format for idle-frame detection: {action_format}")
|
| 327 |
|
| 328 |
|
|
@@ -622,7 +596,7 @@ def launch_viewer(
|
|
| 622 |
cam_panel = client.gui.add_image(np.zeros((64, 64, 3), dtype=np.uint8))
|
| 623 |
renderer.set_video_panel(cam_panel)
|
| 624 |
|
| 625 |
-
with client.gui.add_folder("Action
|
| 626 |
action_text = client.gui.add_markdown("*No episode loaded*")
|
| 627 |
|
| 628 |
show = {
|
|
@@ -645,7 +619,7 @@ def launch_viewer(
|
|
| 645 |
)
|
| 646 |
|
| 647 |
def _update_action_text(t: int) -> None:
|
| 648 |
-
"""Update the
|
| 649 |
txt = renderer.format_action_text(t)
|
| 650 |
action_text.content = txt if txt else "*No data*"
|
| 651 |
|
|
@@ -780,14 +754,6 @@ def launch_viewer(
|
|
| 780 |
)
|
| 781 |
state.video = get_video_from_sample(sample)
|
| 782 |
|
| 783 |
-
# Inject FK joint configs when the dataset provides them (e.g. UR).
|
| 784 |
-
jc = sample.get("joint_configs")
|
| 785 |
-
if jc is not None:
|
| 786 |
-
state.joint_configs = (
|
| 787 |
-
jc.numpy().astype(np.float32)
|
| 788 |
-
if isinstance(jc, torch.Tensor)
|
| 789 |
-
else np.asarray(jc, dtype=np.float32)
|
| 790 |
-
)
|
| 791 |
status_text.content = "⏳ Loading robot animation..."
|
| 792 |
renderer.load(state, entry, to_opencv=to_opencv)
|
| 793 |
_rebuild_robot_frame_toggles()
|
|
@@ -805,7 +771,7 @@ def launch_viewer(
|
|
| 805 |
+ (f"Task: {ai_caption_text}\n\n" if ai_caption_text else "")
|
| 806 |
+ (f"Debug: {debug_caption_text}\n\n" if debug_caption_text else "")
|
| 807 |
+ (
|
| 808 |
-
f"Steps: {T} | Raw: {raw_action_label} ({action_raw.shape[-1]}D)
|
| 809 |
f"Robot: {entry.robot_name or '—'} | FPS: {entry.fps}"
|
| 810 |
)
|
| 811 |
)
|
|
@@ -947,7 +913,7 @@ def launch_viewer(
|
|
| 947 |
|
| 948 |
|
| 949 |
def main():
|
| 950 |
-
parser = argparse.ArgumentParser(description="Action dataset viewer
|
| 951 |
parser.add_argument("--port", type=int, default=8013)
|
| 952 |
parser.add_argument("--share", action="store_true")
|
| 953 |
parser.add_argument("--chunk-length", type=int, default=16)
|
|
|
|
| 4 |
|
| 5 |
"""Interactive 3D viewer for robot action datasets.
|
| 6 |
|
| 7 |
+
Uses raw release action layouts: 9D ego/camera, 10D single-arm, and 20D dual-arm.
|
| 8 |
+
Each sample is converted to a small internal render state before drawing.
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
Dependencies::
|
| 11 |
|
|
|
|
| 15 |
# Use each dataset's declared raw action format:
|
| 16 |
uv run python cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py --share
|
| 17 |
|
|
|
|
|
|
|
| 18 |
"""
|
| 19 |
|
| 20 |
from __future__ import annotations
|
|
|
|
| 87 |
dataset_class = target if isinstance(target, str) else f"{target.__module__}.{target.__qualname__}"
|
| 88 |
ds_items = ds_cfg.items() if isinstance(ds_cfg, dict) else ds_cfg.items()
|
| 89 |
dataset_kwargs = {key: value for key, value in ds_items if key != "_target_"}
|
|
|
|
| 90 |
if viewer_overrides is not None:
|
| 91 |
dataset_kwargs.update(viewer_overrides)
|
| 92 |
|
|
|
|
| 125 |
DATASET_UMI_256,
|
| 126 |
)
|
| 127 |
|
|
|
|
|
|
|
| 128 |
from cosmos_framework.data.vfm.action.bridge_orig_lerobot_dataset import _BRIDGE_TO_OPENCV
|
| 129 |
from cosmos_framework.data.vfm.action.droid_lerobot_dataset import _DROID_TO_OPENCV
|
| 130 |
from cosmos_framework.data.vfm.action.fractal import _GOOGLE_ROBOT_TO_OPENCV
|
|
|
|
| 139 |
max_finger_width=0.0,
|
| 140 |
fps=10,
|
| 141 |
action_format=ActionFormat.EGO_9D,
|
|
|
|
| 142 |
),
|
| 143 |
"fractal": _lazycfg_to_entry(
|
| 144 |
DATASET_FRACTAL_256,
|
|
|
|
| 149 |
camera_fov_deg=69.0,
|
| 150 |
camera_aspect=320 / 256,
|
| 151 |
to_opencv=_GOOGLE_ROBOT_TO_OPENCV,
|
|
|
|
| 152 |
),
|
| 153 |
"bridge": _lazycfg_to_entry(
|
| 154 |
DATASET_BRIDGE_480,
|
|
|
|
| 157 |
fps=5,
|
| 158 |
action_format=ActionFormat.SINGLE_ARM_10D,
|
| 159 |
to_opencv=_BRIDGE_TO_OPENCV,
|
|
|
|
| 160 |
),
|
| 161 |
"droid": _lazycfg_to_entry(
|
| 162 |
DATASET_DROID_480,
|
|
|
|
| 165 |
fps=15,
|
| 166 |
action_format=ActionFormat.SINGLE_ARM_10D,
|
| 167 |
to_opencv=_DROID_TO_OPENCV,
|
|
|
|
| 168 |
),
|
| 169 |
"umi": _lazycfg_to_entry(
|
| 170 |
DATASET_UMI_256,
|
|
|
|
| 172 |
max_finger_width=0.0,
|
| 173 |
fps=20,
|
| 174 |
action_format=ActionFormat.SINGLE_ARM_10D,
|
|
|
|
| 175 |
),
|
| 176 |
"robomind_franka": _lazycfg_to_entry(
|
| 177 |
DATASET_ROBOMIND_FRANKA_480,
|
|
|
|
| 180 |
fps=10,
|
| 181 |
action_format=ActionFormat.SINGLE_ARM_10D,
|
| 182 |
to_opencv=_FRANKA_TO_OPENCV,
|
|
|
|
| 183 |
),
|
| 184 |
"robomind_franka_dual": _lazycfg_to_entry(
|
| 185 |
DATASET_ROBOMIND_FRANKA_DUAL_480,
|
|
|
|
| 196 |
dtype=np.float32,
|
| 197 |
),
|
| 198 |
to_opencv=_FRANKA_TO_OPENCV,
|
|
|
|
| 199 |
),
|
| 200 |
}
|
| 201 |
|
|
|
|
| 223 |
|
| 224 |
# UMI: factory function
|
| 225 |
if callable(cls) and not inspect.isclass(cls):
|
| 226 |
+
_OMEGACONF_BLOCKLIST = {"chunk_length", "split", "enable_fast_init"}
|
| 227 |
kwargs = {k: v for k, v in kwargs.items() if k not in _OMEGACONF_BLOCKLIST}
|
| 228 |
kwargs["eager_load"] = True
|
| 229 |
return cls(**kwargs)
|
|
|
|
| 297 |
Rot("rot6d", prefix="right"),
|
| 298 |
Gripper(prefix="right"),
|
| 299 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
raise ValueError(f"Unsupported action format for idle-frame detection: {action_format}")
|
| 301 |
|
| 302 |
|
|
|
|
| 596 |
cam_panel = client.gui.add_image(np.zeros((64, 64, 3), dtype=np.uint8))
|
| 597 |
renderer.set_video_panel(cam_panel)
|
| 598 |
|
| 599 |
+
with client.gui.add_folder("Action"):
|
| 600 |
action_text = client.gui.add_markdown("*No episode loaded*")
|
| 601 |
|
| 602 |
show = {
|
|
|
|
| 619 |
)
|
| 620 |
|
| 621 |
def _update_action_text(t: int) -> None:
|
| 622 |
+
"""Update the action display for one client."""
|
| 623 |
txt = renderer.format_action_text(t)
|
| 624 |
action_text.content = txt if txt else "*No data*"
|
| 625 |
|
|
|
|
| 754 |
)
|
| 755 |
state.video = get_video_from_sample(sample)
|
| 756 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 757 |
status_text.content = "⏳ Loading robot animation..."
|
| 758 |
renderer.load(state, entry, to_opencv=to_opencv)
|
| 759 |
_rebuild_robot_frame_toggles()
|
|
|
|
| 771 |
+ (f"Task: {ai_caption_text}\n\n" if ai_caption_text else "")
|
| 772 |
+ (f"Debug: {debug_caption_text}\n\n" if debug_caption_text else "")
|
| 773 |
+ (
|
| 774 |
+
f"Steps: {T} | Raw action: {raw_action_label} ({action_raw.shape[-1]}D) | "
|
| 775 |
f"Robot: {entry.robot_name or '—'} | FPS: {entry.fps}"
|
| 776 |
)
|
| 777 |
)
|
|
|
|
| 913 |
|
| 914 |
|
| 915 |
def main():
|
| 916 |
+
parser = argparse.ArgumentParser(description="Action dataset viewer")
|
| 917 |
parser.add_argument("--port", type=int, default=8013)
|
| 918 |
parser.add_argument("--share", action="store_true")
|
| 919 |
parser.add_argument("--chunk-length", type=int, default=16)
|