diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000000000000000000000000000000000000..6a1b0fdbd5e7cecb77b9d047eb1827577be4ba0f
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,21 @@
+.git
+.cache
+.venv
+venv
+.env
+
+# Patterns are anchored at the build-context root, so a leading `**/` is needed
+# for them to match inside cosmos-framework/ and assets/ as well.
+**/__pycache__
+**/*.pyc
+**/*.pyo
+**/*.pyd
+**/.DS_Store
+
+# Root-level `data` directory only. Deliberately NOT `**/data`: the packaged
+# examples keep their parquet files under assets/examples/**/data/.
+data
+
+# Exclude vendored LFS pointer assets not needed by the viewer.
+cosmos-framework/inputs/omni/01_hammer_nail.png
+cosmos-framework/assets/cosmos-logo-thumbnail.png
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..f844a4d4bbbaf7733c8d0b66962e833baab69a40
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,20 @@
+assets/examples/bridge_lerobot_v3/meta/episodes/chunk-000/file-000.parquet filter=lfs diff=lfs merge=lfs -text
+assets/examples/bridge_lerobot_v3/videos/observation.images.image_0/chunk-000/file-000.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/meta/episodes/chunk-000/file-000.parquet filter=lfs diff=lfs merge=lfs -text
+assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/data/chunk-000/file-000.parquet filter=lfs diff=lfs merge=lfs -text
+assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/meta/episodes/chunk-000/file-000.parquet filter=lfs diff=lfs merge=lfs -text
+assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/videos/observation.images.camera_front/chunk-000/file-000.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/videos/observation.images.camera_left/chunk-000/file-000.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/videos/observation.images.camera_right/chunk-000/file-000.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/examples/droid_plus_lerobot_640x360_20260412/success/data/chunk-000/file-000.parquet filter=lfs diff=lfs merge=lfs -text +assets/examples/droid_plus_lerobot_640x360_20260412/success/meta/episodes/chunk-000/file-000.parquet filter=lfs diff=lfs merge=lfs -text +assets/examples/droid_plus_lerobot_640x360_20260412/success/videos/observation.image.exterior_image_1_left/chunk-000/file-000.mp4 filter=lfs diff=lfs merge=lfs -text +assets/examples/droid_plus_lerobot_640x360_20260412/success/videos/observation.image.exterior_image_2_left/chunk-000/file-000.mp4 filter=lfs diff=lfs merge=lfs -text +assets/examples/droid_plus_lerobot_640x360_20260412/success/videos/observation.image.wrist_image_left/chunk-000/file-000.mp4 filter=lfs diff=lfs merge=lfs -text +assets/examples/fractal20220817_data/videos/observation.images.image/chunk-000/file-000.mp4 filter=lfs diff=lfs merge=lfs -text +assets/examples/av_v2_03292026_wdinfo/data/00000000.tar filter=lfs diff=lfs merge=lfs -text +assets/examples/fastumi/fastumi_single_arm/pour_coke/videos/observation.image.right_main_camera_rgb/chunk-000/file-000.mp4 filter=lfs diff=lfs merge=lfs -text +assets/examples/AgiBotWorld-GEAR_20260208/agibot-offshelf/20251016_500h/gripper/task_1018/data/chunk-000/file_000.parquet filter=lfs diff=lfs merge=lfs -text +assets/examples/AgiBotWorld-GEAR_20260208/agibot-offshelf/20251016_500h/gripper/task_1018/videos/observation.images.hand_left/chunk-000/file-000.mp4 filter=lfs diff=lfs merge=lfs -text +assets/examples/AgiBotWorld-GEAR_20260208/agibot-offshelf/20251016_500h/gripper/task_1018/videos/observation.images.hand_right/chunk-000/file-000.mp4 filter=lfs diff=lfs merge=lfs -text +assets/examples/AgiBotWorld-GEAR_20260208/agibot-offshelf/20251016_500h/gripper/task_1018/videos/observation.images.top_head/chunk-000/file-000.mp4 filter=lfs diff=lfs merge=lfs -text diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 
0000000000000000000000000000000000000000..41cd670c140529d471f38cca0c4e72b3a15a6572
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,90 @@
+FROM python:3.11-slim
+
+# Runtime configuration. PORT mirrors `app_port` in README.md.
+# PIP_NO_CACHE_DIR and UV_NO_CACHE keep download caches out of the image layers;
+# uv ignores pip's setting, so it needs its own switch.
+ENV PYTHONUNBUFFERED=1 \
+    PORT=7860 \
+    PIP_NO_CACHE_DIR=1 \
+    UV_NO_CACHE=1 \
+    HF_HUB_DISABLE_XET=1
+
+# OS packages: update, install and list cleanup happen in one layer so the apt
+# package index never ends up in the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+      build-essential \
+      ca-certificates \
+      curl \
+      ffmpeg \
+      git \
+      libegl1 \
+      libgl1 \
+      libglib2.0-0 \
+      libgomp1 \
+      libosmesa6 \
+      linux-libc-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Installer tooling and the CPU-only torch wheels need nothing from the build
+# context, so they are installed before the first COPY and stay cached when the
+# framework sources or the demo assets change.
+RUN python -m pip install --upgrade pip uv \
+    && uv pip install --system torch==2.6.0 torchvision==0.21.0 \
+         --index-url https://download.pytorch.org/whl/cpu
+
+# The framework is installed in editable mode, so its sources must exist first.
+COPY cosmos-framework /app/cosmos-framework
+
+# Install order is unchanged: editable framework, third-party deps, torchcodec.
+# 'torchcodec==0.2.*' is quoted so the shell can never glob-expand it.
+RUN uv pip install --system -e /app/cosmos-framework --no-deps \
+    && uv pip install --system \
+         accelerate \
+         av \
+         boto3 \
+         cattrs \
+         'datasets>=2.19.0' \
+         diffusers \
+         einops \
+         hf_transfer \
+         huggingface_hub \
+         hydra-core \
+         imageio \
+         imageio-ffmpeg \
+         iopath \
+         lerobot==0.4.4 \
+         loguru \
+         msgpack \
+         mujoco \
+         obstore \
+         omegaconf \
+         opencv-python-headless \
+         pandas \
+         pin \
+         pyarrow \
+         pydantic \
+         pyyaml \
+         requests \
+         scipy \
+         termcolor \
+         'transformers>=4.57.1,<5.0.0' \
+         trimesh \
+         tyro \
+         viser \
+         websockets \
+    && uv pip install --system 'torchcodec==0.2.*' \
+         --index-url https://download.pytorch.org/whl/cpu
+
+# Demo data and the launcher are copied last so that editing them does not
+# invalidate any of the dependency layers above.
+COPY assets /app/assets
+COPY start.sh /app/start.sh
+
+# Documentation only: the port declared as PORT above and app_port in README.md.
+EXPOSE 7860
+
+# NOTE(review): no USER is set, so the process runs as root. start.sh is not
+# visible from here; confirm what it writes before switching to a non-root user.
+CMD ["bash", "/app/start.sh"]
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b98538d55c9dc5f590274b4ed808a22e6eb31da8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,36 @@
+---
+title: Cosmos3 Action Viewer
+emoji: 🤖
+colorFrom: blue
+colorTo: green
+sdk: docker
+app_port: 7860
+pinned: false
+license: other
+---
+
+# Cosmos3 Action Viewer
+
+Minimal Docker Space for a standalone Cosmos3 action data viewer.
+ +Packaged local demo data under `assets/examples/`: + +- Bridge: 2 local episodes +- Fractal: 1 local episode +- DROID: 1 local success episode +- RoboMIND Franka: 1 local episode +- RoboMIND Franka dual: 1 local episode +- AV: 1 local wdinfo/tar shard with 10 samples +- UMI/FastUMI: 1 local episode + +Runtime behavior: no HF dataset download or video streaming by default. The app runs the existing `viser` URDF/action viewer on the Hugging Face Spaces port. + +Environment overrides are still supported for larger mounted datasets: + +- `BRIDGE_LEROBOT_ROOT` +- `FRACTAL_ROOT` +- `DROID_ROOT` +- `ROBOMIND_FRANKA_ROOT` +- `ROBOMIND_FRANKA_DUAL_ROOT` +- `AV_ROOT` +- `UMI_ROOT` diff --git a/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/data/chunk-000/file-000.parquet b/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/data/chunk-000/file-000.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f326e41d69bdb2d346df45446c24dee98fb34b8c Binary files /dev/null and b/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/data/chunk-000/file-000.parquet differ diff --git a/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/manifest.json b/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..5a272fd49a45dfa39d7ab561a1e5fbf687814660 --- /dev/null +++ b/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/manifest.json @@ -0,0 +1,21 @@ +{ + "source_root": "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1", + "source_episode": 0, + "local_episode": 0, + "source_index_range": [ + 0, + 120 + ], + "frames": 120, + 
"source_task_indices": [ + 0 + ], + "tasks": [ + "Close the trash can by pressing down from the back." + ], + "videos": [ + "videos/observation.images.camera_top/chunk-000/file-000.mp4", + "videos/observation.images.camera_left/chunk-000/file-000.mp4", + "videos/observation.images.camera_right/chunk-000/file-000.mp4" + ] +} diff --git a/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/meta/episodes/chunk-000/file-000.parquet b/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/meta/episodes/chunk-000/file-000.parquet new file mode 100644 index 0000000000000000000000000000000000000000..76040781e80c0821794074b9bcfcb601e1d24b69 --- /dev/null +++ b/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/meta/episodes/chunk-000/file-000.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdbf9cef8d1a5d04e90459c24f3b2092c598ebf3459fc90740357b340bb08faf +size 123148 diff --git a/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/meta/info.json b/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/meta/info.json new file mode 100644 index 0000000000000000000000000000000000000000..2cd0c1894397773980643a7406b0e59f124eaf1c --- /dev/null +++ b/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/meta/info.json @@ -0,0 +1,174 @@ +{ + "codebase_version": "v3.0", + "robot_type": "franka_3rgb", + "total_episodes": 1, + "total_frames": 120, + "total_tasks": 1, + "chunks_size": 1000, + "data_files_size_in_mb": 100, + "video_files_size_in_mb": 200, + "fps": 30, + "splits": { + "train": "0:1" + }, + "data_path": "data/chunk-{chunk_index:03d}/file-{file_index:03d}.parquet", + "video_path": "videos/{video_key}/chunk-{chunk_index:03d}/file-{file_index:03d}.mp4", + "features": { + "observation.images.camera_top": { + "dtype": 
"video", + "shape": [ + 720, + 1280, + 3 + ], + "names": [ + "height", + "width", + "rgb" + ], + "info": { + "video.height": 720, + "video.width": 1280, + "video.codec": "av1", + "video.pix_fmt": "yuv420p", + "video.is_depth_map": false, + "video.fps": 30, + "video.channels": 3, + "has_audio": false + } + }, + "observation.images.camera_left": { + "dtype": "video", + "shape": [ + 480, + 640, + 3 + ], + "names": [ + "height", + "width", + "rgb" + ], + "info": { + "video.height": 480, + "video.width": 640, + "video.codec": "av1", + "video.pix_fmt": "yuv420p", + "video.is_depth_map": false, + "video.fps": 30, + "video.channels": 3, + "has_audio": false + } + }, + "observation.images.camera_right": { + "dtype": "video", + "shape": [ + 480, + 640, + 3 + ], + "names": [ + "height", + "width", + "rgb" + ], + "info": { + "video.height": 480, + "video.width": 640, + "video.codec": "av1", + "video.pix_fmt": "yuv420p", + "video.is_depth_map": false, + "video.fps": 30, + "video.channels": 3, + "has_audio": false + } + }, + "observation.states.end_effector": { + "dtype": "float32", + "shape": [ + 6 + ], + "names": { + "motors": [ + "x", + "y", + "z", + "r", + "p", + "y" + ] + } + }, + "observation.states.joint_position": { + "dtype": "float32", + "shape": [ + 8 + ], + "names": { + "motors": [ + "joint_0", + "joint_1", + "joint_2", + "joint_3", + "joint_4", + "joint_5", + "joint_6", + "gripper" + ] + } + }, + "actions.joint_position": { + "dtype": "float32", + "shape": [ + 8 + ], + "names": { + "motors": [ + "joint_0", + "joint_1", + "joint_2", + "joint_3", + "joint_4", + "joint_5", + "joint_6", + "gripper" + ] + } + }, + "timestamp": { + "dtype": "float32", + "shape": [ + 1 + ], + "names": null + }, + "frame_index": { + "dtype": "int64", + "shape": [ + 1 + ], + "names": null + }, + "episode_index": { + "dtype": "int64", + "shape": [ + 1 + ], + "names": null + }, + "index": { + "dtype": "int64", + "shape": [ + 1 + ], + "names": null + }, + "task_index": { + "dtype": "int64", + 
"shape": [ + 1 + ], + "names": null + } + } +} diff --git a/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/meta/stats.json b/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/meta/stats.json new file mode 100644 index 0000000000000000000000000000000000000000..6ddb9b8a88e3910f8ebd5f76fc2111cbf002c7e8 --- /dev/null +++ b/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/meta/stats.json @@ -0,0 +1,903 @@ +{ + "timestamp": { + "min": [ + 0.0 + ], + "max": [ + 10.933333333333334 + ], + "mean": [ + 1.9684300844334923 + ], + "std": [ + 1.3673979573511015 + ], + "count": [ + 31662 + ], + "q01": [ + 0.02376322090207472 + ], + "q10": [ + 0.37904119386356827 + ], + "q50": [ + 1.959688231107258 + ], + "q90": [ + 3.553989463285689 + ], + "q99": [ + 3.9125398910217504 + ] + }, + "observation.states.end_effector": { + "min": [ + 0.5030578374862671, + -0.39535877108573914, + 0.4033958315849304, + -3.141587972640991, + -0.870353639125824, + -1.7021530866622925 + ], + "max": [ + 0.7792795300483704, + 0.4450030028820038, + 0.6821127533912659, + 3.1415915489196777, + 0.25157487392425537, + 1.1252886056900024 + ], + "mean": [ + 0.6255840464337402, + 0.012989276388556903, + 0.5490484980667585, + 0.1625279631562389, + -0.20268731521655853, + -0.22077285899314192 + ], + "std": [ + 0.06039222410375044, + 0.1449360540909791, + 0.06623187408317564, + 2.991057969862546, + 0.20245307551388686, + 0.5210916269783522 + ], + "count": [ + 31662 + ], + "q01": [ + 0.5561097662572897, + -0.09380602512150898, + 0.4346114056532645, + -1.8705973324493732, + -0.5094947386697036, + -0.6075075655854922 + ], + "q10": [ + 0.5632028311365838, + -0.08542720523581471, + 0.4456098187752242, + -1.6207854327266906, + -0.4610935596481184, + -0.5787010975715685 + ], + "q50": [ + 0.6348575462982353, + 0.015158164848026623, + 0.5696579518852141, + 0.005318343743618634, + -0.19820279019169973, 
+ -0.2654616249987716 + ], + "q90": [ + 0.6865314425143914, + 0.10741322351539781, + 0.6143608893539235, + 2.0985132183140247, + -0.01623157041115065, + 0.15955954879741835 + ], + "q99": [ + 0.6927857513943124, + 0.11675971906958119, + 0.6229099651133552, + 2.696355882965309, + 0.0003145221689061533, + 0.1871979781045757 + ] + }, + "index": { + "min": [ + 0 + ], + "max": [ + 31661 + ], + "mean": [ + 15830.5 + ], + "std": [ + 9140.032106982264 + ], + "count": [ + 31662 + ], + "q01": [ + 15772.159994096948 + ], + "q10": [ + 15782.81833328525 + ], + "q50": [ + 15830.23856370413 + ], + "q90": [ + 15878.066781363226 + ], + "q99": [ + 15888.823294194774 + ] + }, + "observation.images.camera_left": { + "min": [ + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ] + ], + "max": [ + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ] + ], + "mean": [ + [ + [ + 0.5053939524871568 + ] + ], + [ + [ + 0.5129378107679405 + ] + ], + [ + [ + 0.5038425321864779 + ] + ] + ], + "std": [ + [ + [ + 0.006196154475182948 + ] + ], + [ + [ + 0.006291749597368619 + ] + ], + [ + [ + 0.007139137590419621 + ] + ] + ], + "count": [ + 26766 + ], + "q01": [ + [ + [ + 0.050804214742590795 + ] + ], + [ + [ + 0.06535308523113001 + ] + ], + [ + [ + 0.05578779593795524 + ] + ] + ], + "q10": [ + [ + [ + 0.3346875522241501 + ] + ], + [ + [ + 0.3538745545006605 + ] + ], + [ + [ + 0.3279563043968654 + ] + ] + ], + "q50": [ + [ + [ + 0.5512415615575815 + ] + ], + [ + [ + 0.557746025835581 + ] + ], + [ + [ + 0.5442588739492085 + ] + ] + ], + "q90": [ + [ + [ + 0.5930081145488154 + ] + ], + [ + [ + 0.6046346760047809 + ] + ], + [ + [ + 0.6083802395861199 + ] + ] + ], + "q99": [ + [ + [ + 0.6335368100177181 + ] + ], + [ + [ + 0.6387606491834414 + ] + ], + [ + [ + 0.6516727281816006 + ] + ] + ] + }, + "actions.joint_position": { + "min": [ + -0.4739997386932373, + -0.1978835165500641, + -0.5645049214363098, + -1.863785982131958, + -0.5414947271347046, + 1.394435167312622, + 
-0.8218851685523987, + 0.0 + ], + "max": [ + 0.7025632262229919, + 0.6534756422042847, + 0.23316508531570435, + -1.1949712038040161, + 0.6181789040565491, + 2.988194465637207, + 1.5723140239715576, + 1.0 + ], + "mean": [ + 0.08852916841457183, + 0.20379400446678728, + -0.07347157121414831, + -1.560854356627464, + -0.027502359479121323, + 2.011894774820232, + 0.2276453738724793, + 0.6471313536858972 + ], + "std": [ + 0.18431886493226315, + 0.18677245861500938, + 0.09783784951187993, + 0.10504809264513078, + 0.09190849461431556, + 0.3786779746935214, + 0.4578454919804973, + 0.4530472127902388 + ], + "count": [ + 31662 + ], + "q01": [ + -0.059593112691433844, + -0.03240261068188407, + -0.1217730086501231, + -1.6653622219651423, + -0.10857946302997483, + 1.609264057957687, + -0.13058724798860544, + -1.000000013351432e-10 + ], + "q10": [ + -0.04501642076638965, + -0.00871990155880731, + -0.11709558777130803, + -1.6493733834129907, + -0.09368463558629822, + 1.628428367281585, + -0.10049625833210918, + 0.0002743634253506962 + ], + "q50": [ + 0.09818325634298403, + 0.21136092309578877, + -0.08057754637075476, + -1.5806816619484865, + -0.023714171368019024, + 2.0448558221656024, + 0.28801769232764407, + 0.9181340869801482 + ], + "q90": [ + 0.21040525446981903, + 0.41854284251779594, + -0.023043113030900307, + -1.4390443497453502, + 0.03691071807769244, + 2.4299963875273014, + 0.5117685732510093, + 0.9967792755991135 + ], + "q99": [ + 0.22353138137318673, + 0.45129443785420625, + -0.01353270384012705, + -1.4236844133752204, + 0.05254005867616058, + 2.4802114284864163, + 0.5369015031084761, + 0.9971542697572874 + ] + }, + "observation.images.camera_top": { + "min": [ + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ] + ], + "max": [ + [ + [ + 1.0 + ] + ], + [ + [ + 0.984313725490196 + ] + ], + [ + [ + 1.0 + ] + ] + ], + "mean": [ + [ + [ + 0.4628883239385903 + ] + ], + [ + [ + 0.4678657429689597 + ] + ], + [ + [ + 0.4591371632804239 + ] + ] + ], + "std": [ + 
[ + [ + 0.005320493668924655 + ] + ], + [ + [ + 0.005264511466064699 + ] + ], + [ + [ + 0.005299951762028905 + ] + ] + ], + "count": [ + 26766 + ], + "q01": [ + [ + [ + 0.048390354929310465 + ] + ], + [ + [ + 0.05576798281595686 + ] + ], + [ + [ + 0.04296608011754174 + ] + ] + ], + "q10": [ + [ + [ + 0.3990018407803951 + ] + ], + [ + [ + 0.39802162530680263 + ] + ], + [ + [ + 0.3742076209782547 + ] + ] + ], + "q50": [ + [ + [ + 0.48167112014209434 + ] + ], + [ + [ + 0.48865381487960363 + ] + ], + [ + [ + 0.481724617238358 + ] + ] + ], + "q90": [ + [ + [ + 0.5398092052009987 + ] + ], + [ + [ + 0.5407023438466012 + ] + ], + [ + [ + 0.5376467137497242 + ] + ] + ], + "q99": [ + [ + [ + 0.5786545035149396 + ] + ], + [ + [ + 0.5749881050131567 + ] + ], + [ + [ + 0.5828155228727975 + ] + ] + ] + }, + "observation.images.camera_right": { + "min": [ + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ] + ], + "max": [ + [ + [ + 0.9568627450980393 + ] + ], + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ] + ], + "mean": [ + [ + [ + 0.4753470970603862 + ] + ], + [ + [ + 0.48258902097804496 + ] + ], + [ + [ + 0.47228931382713346 + ] + ] + ], + "std": [ + [ + [ + 0.007750263540936345 + ] + ], + [ + [ + 0.007283703931002962 + ] + ], + [ + [ + 0.00728236419430919 + ] + ] + ], + "count": [ + 26766 + ], + "q01": [ + [ + [ + 0.06011670652045735 + ] + ], + [ + [ + 0.07469315368796249 + ] + ], + [ + [ + 0.07407955894489102 + ] + ] + ], + "q10": [ + [ + [ + 0.31715286275651944 + ] + ], + [ + [ + 0.3215254711229003 + ] + ], + [ + [ + 0.30353990994166524 + ] + ] + ], + "q50": [ + [ + [ + 0.5199269091079658 + ] + ], + [ + [ + 0.5305114367150816 + ] + ], + [ + [ + 0.5209989345006634 + ] + ] + ], + "q90": [ + [ + [ + 0.5632998346913191 + ] + ], + [ + [ + 0.5666944392017282 + ] + ], + [ + [ + 0.5606875699288839 + ] + ] + ], + "q99": [ + [ + [ + 0.5893329859944269 + ] + ], + [ + [ + 0.5858415383992885 + ] + ], + [ + [ + 0.5841699172349623 + ] + ] + ] + }, + "task_index": { + "min": [ 
+ 0 + ], + "max": [ + 0 + ], + "mean": [ + 0.0 + ], + "std": [ + 0.0 + ], + "count": [ + 31662 + ], + "q01": [ + 3.999999999999416e-16 + ], + "q10": [ + 3.999999999999416e-15 + ], + "q50": [ + 1.999999999999708e-14 + ], + "q90": [ + 3.599999999999475e-14 + ], + "q99": [ + 3.959999999999426e-14 + ] + }, + "observation.states.joint_position": { + "min": [ + -0.4792475700378418, + -0.17303875088691711, + -0.5550373196601868, + -1.7863118648529053, + -0.556555449962616, + 1.4112317562103271, + -0.7897247076034546, + 0.0 + ], + "max": [ + 0.6933449506759644, + 0.6483161449432373, + 0.24008235335350037, + -1.2061364650726318, + 0.5948796272277832, + 2.9671719074249268, + 1.5412038564682007, + 0.9955947399139404 + ], + "mean": [ + 0.08307541985392064, + 0.21595505593653658, + -0.06930492230259129, + -1.5465318743318581, + -0.027131365655145914, + 2.014955148517833, + 0.23194884678556466, + 0.6496123414561716 + ], + "std": [ + 0.18346338992543598, + 0.18525191351178247, + 0.09729970935484367, + 0.09688257543821313, + 0.08885077550896962, + 0.37816230296399816, + 0.4538132280198466, + 0.43788230781758464 + ], + "count": [ + 31662 + ], + "q01": [ + -0.059983247997501, + -0.008089137494155725, + -0.11355956738959826, + -1.6362844787051258, + -0.09748544384830576, + 1.6202775963149094, + -0.107512672453896, + -1.000000013351432e-10 + ], + "q10": [ + -0.046061865426954195, + 0.011928289285986255, + -0.10995279671210136, + -1.6234894199315064, + -0.0875393795698981, + 1.635005122528732, + -0.08487479505434022, + 0.0013078465886770091 + ], + "q50": [ + 0.09312705711493158, + 0.21976722762964496, + -0.07653552368003404, + -1.5617101012725234, + -0.021237679524713282, + 2.0507582705260785, + 0.29536171888560564, + 0.9110621240012867 + ], + "q90": [ + 0.20282499261463963, + 0.42640622785173465, + -0.021370758112509814, + -1.443304358172678, + 0.027971435875220843, + 2.423922345921811, + 0.5012210673780202, + 0.9885988833533259 + ], + "q99": [ + 0.21288423933120867, + 
0.44085273069370573, + -0.01217677257791173, + -1.4343831924021644, + 0.036407714533119744, + 2.4646011820637157, + 0.515795658515525, + 0.9889299547353837 + ] + }, + "episode_index": { + "min": [ + 0 + ], + "max": [ + 288 + ], + "mean": [ + 144.28924262522898 + ], + "std": [ + 84.0818915388226 + ], + "count": [ + 31662 + ], + "q01": [ + 144.28924262522898 + ], + "q10": [ + 144.28924262522898 + ], + "q50": [ + 144.28924262522898 + ], + "q90": [ + 144.28924262522898 + ], + "q99": [ + 144.28924262522898 + ] + }, + "frame_index": { + "min": [ + 0 + ], + "max": [ + 328 + ], + "mean": [ + 59.05290253300486 + ], + "std": [ + 41.021938720533086 + ], + "count": [ + 31662 + ], + "q01": [ + 0.7128966299307732 + ], + "q10": [ + 11.371235818250309 + ], + "q50": [ + 58.79025514496828 + ], + "q90": [ + 106.61968389623351 + ], + "q99": [ + 117.37619672778504 + ] + } +} \ No newline at end of file diff --git a/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/meta/tasks.parquet b/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/meta/tasks.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9066bd7f9cd0a39622b670e1918aff9d612c788d Binary files /dev/null and b/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/meta/tasks.parquet differ diff --git a/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/videos/observation.images.camera_left/chunk-000/file-000.mp4 b/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/videos/observation.images.camera_left/chunk-000/file-000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..47905c0158a7469609f4612f7e08db05970ec124 Binary files /dev/null and 
b/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/videos/observation.images.camera_left/chunk-000/file-000.mp4 differ diff --git a/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/videos/observation.images.camera_right/chunk-000/file-000.mp4 b/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/videos/observation.images.camera_right/chunk-000/file-000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..111035744d30471db537891406bae62f98208507 Binary files /dev/null and b/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/videos/observation.images.camera_right/chunk-000/file-000.mp4 differ diff --git a/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/videos/observation.images.camera_top/chunk-000/file-000.mp4 b/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/videos/observation.images.camera_top/chunk-000/file-000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..1c0fad196776cc4d56f0c3d94d3ecbc181e6097e Binary files /dev/null and b/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1/videos/observation.images.camera_top/chunk-000/file-000.mp4 differ diff --git a/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/data/chunk-000/file-000.parquet b/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/data/chunk-000/file-000.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7edc84f5f1a3217bcc31634a6f8da80caf7c4c73 --- /dev/null +++ b/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/data/chunk-000/file-000.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d9053fdb89f8f6990015201d1fc321e1b3999735e575e7c2f35b7fbf12ccbc4e +size 134374 diff --git a/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/manifest.json b/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..f4ea682d23e12618686ab113581d18dc3cfa3b06 --- /dev/null +++ b/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/manifest.json @@ -0,0 +1,21 @@ +{ + "source_root": "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water", + "source_episode": 0, + "local_episode": 0, + "source_index_range": [ + 0, + 574 + ], + "frames": 574, + "source_task_indices": [ + 0 + ], + "tasks": [ + "pour water with both arm" + ], + "videos": [ + "videos/observation.images.camera_front/chunk-000/file-000.mp4", + "videos/observation.images.camera_left/chunk-000/file-000.mp4", + "videos/observation.images.camera_right/chunk-000/file-000.mp4" + ] +} diff --git a/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/meta/episodes/chunk-000/file-000.parquet b/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/meta/episodes/chunk-000/file-000.parquet new file mode 100644 index 0000000000000000000000000000000000000000..12a94aa07ae60a89aebbff4080ce039bb1d082a6 --- /dev/null +++ b/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/meta/episodes/chunk-000/file-000.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bbb518c7f9c370968278cc3746c3526bd7a09758630773ac4193e33eec7f067 +size 140145 diff --git a/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/meta/info.json 
b/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/meta/info.json new file mode 100644 index 0000000000000000000000000000000000000000..98a9300150c3c9d4ac2ea400a8742e722a9d17e4 --- /dev/null +++ b/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/meta/info.json @@ -0,0 +1,186 @@ +{ + "codebase_version": "v3.0", + "robot_type": "franka_fr3_dual", + "total_episodes": 1, + "total_frames": 574, + "total_tasks": 1, + "chunks_size": 1000, + "data_files_size_in_mb": 100, + "video_files_size_in_mb": 200, + "fps": 30, + "splits": { + "train": "0:1" + }, + "data_path": "data/chunk-{chunk_index:03d}/file-{file_index:03d}.parquet", + "video_path": "videos/{video_key}/chunk-{chunk_index:03d}/file-{file_index:03d}.mp4", + "features": { + "observation.images.camera_front": { + "dtype": "video", + "shape": [ + 720, + 1280, + 3 + ], + "names": [ + "height", + "width", + "rgb" + ], + "info": { + "video.height": 720, + "video.width": 1280, + "video.codec": "av1", + "video.pix_fmt": "yuv420p", + "video.is_depth_map": false, + "video.fps": 30, + "video.channels": 3, + "has_audio": false + } + }, + "observation.images.camera_left": { + "dtype": "video", + "shape": [ + 480, + 640, + 3 + ], + "names": [ + "height", + "width", + "rgb" + ], + "info": { + "video.height": 480, + "video.width": 640, + "video.codec": "av1", + "video.pix_fmt": "yuv420p", + "video.is_depth_map": false, + "video.fps": 30, + "video.channels": 3, + "has_audio": false + } + }, + "observation.images.camera_right": { + "dtype": "video", + "shape": [ + 480, + 640, + 3 + ], + "names": [ + "height", + "width", + "rgb" + ], + "info": { + "video.height": 480, + "video.width": 640, + "video.codec": "av1", + "video.pix_fmt": "yuv420p", + "video.is_depth_map": false, + "video.fps": 30, + "video.channels": 3, + "has_audio": false + } + }, + "observation.states.end_effector": { + "dtype": "float32", + "shape": [ + 12 + ], + "names": { + "motors": [ + 
"left_xyzrpy", + "right_xyzrpy" + ] + } + }, + "observation.states.joint_position": { + "dtype": "float32", + "shape": [ + 16 + ], + "names": { + "motors": [ + "left_joint_0", + "left_joint_1", + "left_joint_2", + "left_joint_3", + "left_joint_4", + "left_joint_5", + "left_joint_6", + "left_gripper", + "right_joint_0", + "right_joint_1", + "right_joint_2", + "right_joint_3", + "right_joint_4", + "right_joint_5", + "right_joint_6", + "right_gripper" + ] + } + }, + "actions.joint_position": { + "dtype": "float32", + "shape": [ + 16 + ], + "names": { + "motors": [ + "left_joint_0", + "left_joint_1", + "left_joint_2", + "left_joint_3", + "left_joint_4", + "left_joint_5", + "left_joint_6", + "left_gripper", + "right_joint_0", + "right_joint_1", + "right_joint_2", + "right_joint_3", + "right_joint_4", + "right_joint_5", + "right_joint_6", + "right_gripper" + ] + } + }, + "timestamp": { + "dtype": "float32", + "shape": [ + 1 + ], + "names": null + }, + "frame_index": { + "dtype": "int64", + "shape": [ + 1 + ], + "names": null + }, + "episode_index": { + "dtype": "int64", + "shape": [ + 1 + ], + "names": null + }, + "index": { + "dtype": "int64", + "shape": [ + 1 + ], + "names": null + }, + "task_index": { + "dtype": "int64", + "shape": [ + 1 + ], + "names": null + } + } +} diff --git a/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/meta/stats.json b/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/meta/stats.json new file mode 100644 index 0000000000000000000000000000000000000000..b088c8517e90267f193cebfde742a8d21a2a2a60 --- /dev/null +++ b/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/meta/stats.json @@ -0,0 +1,1259 @@ +{ + "observation.images.camera_top": { + "min": [ + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ] + ], + "max": [ + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ] + ], + "mean": [ + [ + [ + 
0.48376929793303225 + ] + ], + [ + [ + 0.4674950063659832 + ] + ], + [ + [ + 0.42110986085802427 + ] + ] + ], + "std": [ + [ + [ + 0.0034515567737140648 + ] + ], + [ + [ + 0.003152313608076121 + ] + ], + [ + [ + 0.0030219598760090854 + ] + ] + ], + "count": [ + 13472 + ], + "q01": [ + [ + [ + 0.09619084794816744 + ] + ], + [ + [ + 0.0870123248734442 + ] + ], + [ + [ + 0.0430602865244451 + ] + ] + ], + "q10": [ + [ + [ + 0.2804504373204744 + ] + ], + [ + [ + 0.2611546733612462 + ] + ], + [ + [ + 0.19861621336452784 + ] + ] + ], + "q50": [ + [ + [ + 0.4573825435387216 + ] + ], + [ + [ + 0.42278935615332147 + ] + ], + [ + [ + 0.37139849968994815 + ] + ] + ], + "q90": [ + [ + [ + 0.7830895781354413 + ] + ], + [ + [ + 0.8220906539922425 + ] + ], + [ + [ + 0.8252257020249846 + ] + ] + ], + "q99": [ + [ + [ + 0.8586179042245964 + ] + ], + [ + [ + 0.9050680340394492 + ] + ], + [ + [ + 0.9365709680791219 + ] + ] + ] + }, + "actions.joint_position": { + "min": [ + -0.5353593230247498, + -0.754718542098999, + -0.777728259563446, + -2.7626993656158447, + -1.0722525119781494, + 1.2210487127304077, + -0.6657305955886841, + 0.0, + -0.36201930046081543, + -0.1242208406329155, + -0.685689389705658, + -1.9895726442337036, + -0.8651651740074158, + 1.334547996520996, + -0.7146927714347839, + 0.00037202381645329297 + ], + "max": [ + 0.859029233455658, + 0.509265661239624, + 0.653475821018219, + -1.3130875825881958, + 0.3420293629169464, + 2.903825521469116, + 1.3529239892959595, + 1.0, + 0.7976546883583069, + 0.9572038650512695, + 0.8713011145591736, + -0.8973791003227234, + 0.5967175960540771, + 2.6613783836364746, + 1.9404549598693848, + 1.0 + ], + "mean": [ + 0.1861865432166611, + -0.017391271701913246, + -0.02958098278051404, + -1.8821133989849286, + -0.25689289227268475, + 2.1217277282727984, + 0.3108133571587931, + 0.6812934040810689, + 0.11169839708835208, + 0.2291464669503562, + 0.12317132691269464, + -1.4440139117058302, + -0.046060681490781845, + 1.781798902100068, + 
0.10212216518653346, + 0.2543521764761343 + ], + "std": [ + 0.29527247764323017, + 0.2941846330159817, + 0.22514641919837344, + 0.3761867395993795, + 0.2813910406889975, + 0.34057403940537867, + 0.4628391164924103, + 0.4555740774662373, + 0.18822520803853124, + 0.22782164980246536, + 0.16998464278726325, + 0.10402822172948001, + 0.18946423032995677, + 0.376722385852282, + 0.360779166884318, + 0.40944824170478805 + ], + "count": [ + 58885 + ], + "q01": [ + -0.20556848787213558, + -0.5508892892828418, + -0.2802029428874371, + -2.541712034898689, + -0.8017318025624431, + 1.488068518107089, + -0.23624862588524992, + 0.0006147995589559869, + -0.07258994513304466, + 0.024481408466384767, + -0.006644251635662588, + -1.6272202850076025, + -0.3526667724106083, + 1.4679249908600462, + -0.22860762983618546, + 0.009139926723121927 + ], + "q10": [ + -0.13583602028636654, + -0.4679717799673135, + -0.21758269732104135, + -2.4371744491637686, + -0.7103455442426116, + 1.6071025304281203, + -0.1550291626558284, + 0.0015784068204869187, + -0.012934452240419532, + 0.05992092372297601, + 0.025861471778839268, + -1.5266868434276748, + -0.3071736542346529, + 1.5083107376661646, + -0.12882271449454785, + 0.010191584920591464 + ], + "q50": [ + 0.18507903561119993, + 0.017982969879327675, + -0.055829753724253084, + -1.6899328910829658, + -0.1419832568784388, + 2.214857399388195, + 0.13183062360135028, + 0.992399260574226, + 0.02703344158757793, + 0.08550643413111958, + 0.083286809780891, + -1.4510293805412144, + 0.002452545848371913, + 1.564874178701891, + -0.010616476610398782, + 0.011183737529185007 + ], + "q90": [ + 0.5609048900649231, + 0.33752525622000595, + 0.21219738654312226, + -1.5041547973889244, + 0.012592934232438515, + 2.498023274709365, + 0.9869082449794297, + 0.9957715569370339, + 0.3374210314229987, + 0.5684143291150674, + 0.2897333408268615, + -1.342906272439828, + 0.1256424085184901, + 2.3885623965208724, + 0.7470419269975446, + 0.9970613443241874 + ], + "q99": [ + 
0.5904568356868924, + 0.3779781713728937, + 0.27680301146924324, + -1.440851923136264, + 0.09058589083479192, + 2.5634236203261356, + 1.0779513714773268, + 0.9966748272296234, + 0.5223264767471835, + 0.6776964674736506, + 0.3301378429046773, + -1.2880005871811373, + 0.2726828715550844, + 2.4654702589781974, + 1.0866745698876468, + 0.9988248189214447 + ] + }, + "observation.images.camera_right": { + "min": [ + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ] + ], + "max": [ + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ] + ], + "mean": [ + [ + [ + 0.46124731880052827 + ] + ], + [ + [ + 0.4553679300835038 + ] + ], + [ + [ + 0.42334209907828685 + ] + ] + ], + "std": [ + [ + [ + 0.01039969061638479 + ] + ], + [ + [ + 0.010249261818554806 + ] + ], + [ + [ + 0.010057614941827324 + ] + ] + ], + "count": [ + 13472 + ], + "q01": [ + [ + [ + 0.059261110210909615 + ] + ], + [ + [ + 0.07429608369146787 + ] + ], + [ + [ + 0.06033915532354615 + ] + ] + ], + "q10": [ + [ + [ + 0.20528986731553428 + ] + ], + [ + [ + 0.20506945620878664 + ] + ], + [ + [ + 0.1643094107999931 + ] + ] + ], + "q50": [ + [ + [ + 0.5191681591791838 + ] + ], + [ + [ + 0.4810674129951322 + ] + ], + [ + [ + 0.3885047815405505 + ] + ] + ], + "q90": [ + [ + [ + 0.6589198999415846 + ] + ], + [ + [ + 0.6763365405598278 + ] + ], + [ + [ + 0.6761300090592645 + ] + ] + ], + "q99": [ + [ + [ + 0.7465792160987124 + ] + ], + [ + [ + 0.7698546910074944 + ] + ], + [ + [ + 0.7657271498394176 + ] + ] + ] + }, + "episode_index": { + "min": [ + 0 + ], + "max": [ + 128 + ], + "mean": [ + 64.18070815997282 + ], + "std": [ + 36.93854045783521 + ], + "count": [ + 58885 + ], + "q01": [ + 64.18070815997282 + ], + "q10": [ + 64.18070815997282 + ], + "q50": [ + 64.18070815997284 + ], + "q90": [ + 64.18070815997284 + ], + "q99": [ + 64.18070815997284 + ] + }, + "observation.states.joint_position": { + "min": [ + -0.5184276700019836, + -0.7133806943893433, + -0.7727153897285461, + 
-2.7567901611328125, + -1.0469406843185425, + 1.2608721256256104, + -0.6555339097976685, + 0.0, + -0.3380506634712219, + -0.0967535451054573, + -0.6921591758728027, + -1.9839798212051392, + -0.8350218534469604, + 1.3372937440872192, + -0.678467869758606, + 0.0 + ], + "max": [ + 0.8559838533401489, + 0.5060783624649048, + 0.6553268432617188, + -1.3234312534332275, + 0.3107798099517822, + 2.8862950801849365, + 1.3210870027542114, + 0.37444934248924255, + 0.796474039554596, + 0.9370087385177612, + 0.8579978942871094, + -0.8903438448905945, + 0.5705297589302063, + 2.6416311264038086, + 1.9162824153900146, + 0.9823788404464722 + ], + "mean": [ + 0.1899299091718875, + -0.006983510289951079, + -0.03201154206908776, + -1.8901113197072066, + -0.25972718498829694, + 2.1236042977917977, + 0.3134960712148691, + 0.25232693715584215, + 0.1181601202667235, + 0.2352566502086829, + 0.11865091471510701, + -1.4396272846927078, + -0.050641272852852554, + 1.7804795026374287, + 0.10953031788464111, + 0.24614797891264326 + ], + "std": [ + 0.2920662065860597, + 0.2829727697192382, + 0.2250152383672195, + 0.3739207836802478, + 0.2706061997178126, + 0.32875287858479824, + 0.4545940888251623, + 0.16491406981261394, + 0.18521363874518934, + 0.22110645365448872, + 0.1681570365055575, + 0.10454250863973764, + 0.18296781127883446, + 0.3756805469840582, + 0.354489103177839, + 0.39247840200787726 + ], + "count": [ + 58885 + ], + "q01": [ + -0.17981989779968843, + -0.5091775890358096, + -0.2823747022384176, + -2.5402944887002263, + -0.7772767335767301, + 1.5074242517409697, + -0.22118355190838992, + 0.00019929485039286372, + -0.05811890221630709, + 0.0355534456639416, + -0.014671354686920067, + -1.6166171728649017, + -0.33262037407789363, + 1.4670034437478248, + -0.20856855427653162, + 0.005969670174481924 + ], + "q10": [ + -0.12896930587932032, + -0.4363874933365298, + -0.22108213657540865, + -2.431940673529726, + -0.6917178109339138, + 1.6199445363543552, + -0.14425280902059237, + 
0.0006490404603215833, + -0.00465639724748521, + 0.06950417070554113, + 0.020541581682950536, + -1.523574035591159, + -0.2990772907187631, + 1.5087846748431677, + -0.12178478728338077, + 0.006887727973015377 + ], + "q50": [ + 0.19071994481038587, + 0.0225044554622574, + -0.05771964361942917, + -1.7039336205325046, + -0.14714522872126526, + 2.208780027646876, + 0.13614323027627798, + 0.36246706129481615, + 0.03542061717828114, + 0.09949137725052416, + 0.07978819556433957, + -1.4442567729696547, + -0.008123791931469346, + 1.5639312834308705, + 0.0035313031234437474, + 0.008194616957793414 + ], + "q90": [ + 0.5617718208281102, + 0.3372213513062743, + 0.21115642563723888, + -1.5094768909216087, + 0.0005424632975009549, + 2.486585004915233, + 0.9708517327751961, + 0.3625096095467171, + 0.34023178117996594, + 0.5548909313712731, + 0.2865432489812551, + -1.3458676838483008, + 0.11952314656085365, + 2.390689009089357, + 0.7439389258839233, + 0.9444897776717005 + ], + "q99": [ + 0.5846436107029699, + 0.3679246810887602, + 0.2686922162187065, + -1.4478673737392953, + 0.0686222391930825, + 2.548678641847992, + 1.0502165134906425, + 0.3625191829033949, + 0.5200118301008778, + 0.6613513655217865, + 0.3248879353556084, + -1.2781377608963183, + 0.2509619238047111, + 2.454751758136821, + 1.0678813767059858, + 0.9451304795927765 + ] + }, + "index": { + "min": [ + 0 + ], + "max": [ + 58884 + ], + "mean": [ + 29442.0 + ], + "std": [ + 16998.635298164376 + ], + "count": [ + 58885 + ], + "q01": [ + 29212.00052687167 + ], + "q10": [ + 29254.219078618255 + ], + "q50": [ + 29441.779063900824 + ], + "q90": [ + 29629.69858367266 + ], + "q99": [ + 29671.991846398254 + ] + }, + "observation.images.camera_left": { + "min": [ + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ] + ], + "max": [ + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ] + ], + "mean": [ + [ + [ + 0.46247320010372533 + ] + ], + [ + [ + 0.4615624414332364 + ] + ], + [ + [ + 
0.42029015289093913 + ] + ] + ], + "std": [ + [ + [ + 0.0032999404373803332 + ] + ], + [ + [ + 0.0027171969210648154 + ] + ], + [ + [ + 0.002928244417570089 + ] + ] + ], + "count": [ + 13472 + ], + "q01": [ + [ + [ + 0.054253433231095896 + ] + ], + [ + [ + 0.07110441821437478 + ] + ], + [ + [ + 0.036649910826101764 + ] + ] + ], + "q10": [ + [ + [ + 0.16653584030201984 + ] + ], + [ + [ + 0.1825803183534302 + ] + ], + [ + [ + 0.14620039681336194 + ] + ] + ], + "q50": [ + [ + [ + 0.4363331616293392 + ] + ], + [ + [ + 0.42586726431636607 + ] + ], + [ + [ + 0.35001359698550183 + ] + ] + ], + "q90": [ + [ + [ + 0.7131412248460454 + ] + ], + [ + [ + 0.7341972090885474 + ] + ], + [ + [ + 0.728190819182643 + ] + ] + ], + "q99": [ + [ + [ + 0.7596985187175224 + ] + ], + [ + [ + 0.7803990377293848 + ] + ], + [ + [ + 0.7760067743886184 + ] + ] + ] + }, + "task_index": { + "min": [ + 0 + ], + "max": [ + 0 + ], + "mean": [ + 0.0 + ], + "std": [ + 0.0 + ], + "count": [ + 58885 + ], + "q01": [ + 3.9999999999994167e-16 + ], + "q10": [ + 3.999999999999418e-15 + ], + "q50": [ + 1.9999999999997094e-14 + ], + "q90": [ + 3.5999999999994754e-14 + ], + "q99": [ + 3.9599999999994235e-14 + ] + }, + "timestamp": { + "min": [ + 0.0 + ], + "max": [ + 21.233333333333334 + ], + "mean": [ + 7.805497721547649 + ], + "std": [ + 4.755558844357094 + ], + "count": [ + 58885 + ], + "q01": [ + 0.13884861717498875 + ], + "q10": [ + 1.5461336754115809 + ], + "q50": [ + 7.798064785372518 + ], + "q90": [ + 14.062117177380546 + ], + "q99": [ + 15.471892601584731 + ] + }, + "frame_index": { + "min": [ + 0 + ], + "max": [ + 637 + ], + "mean": [ + 234.1649316464295 + ], + "std": [ + 142.66676533071268 + ], + "count": [ + 58885 + ], + "q01": [ + 4.1654585180982595 + ], + "q10": [ + 46.384010264673144 + ], + "q50": [ + 233.92680005773957 + ], + "q90": [ + 421.8635153190919 + ], + "q99": [ + 464.1567780446939 + ] + }, + "observation.states.end_effector": { + "min": [ + 0.32554587721824646, + -0.2414933294057846, + 
0.3707878589630127, + -3.1415812969207764, + -0.9135556221008301, + -0.44747135043144226, + 0.44667676091194153, + -0.03310394659638405, + 0.3244316279888153, + -3.1415786743164062, + -0.669172465801239, + -0.9677415490150452 + ], + "max": [ + 0.6721678972244263, + 0.401833176612854, + 0.7120418548583984, + 3.1415834426879883, + 0.6720868349075317, + 0.5769774913787842, + 0.7523236274719238, + 0.5960171818733215, + 0.6841686964035034, + 3.1415915489196777, + 0.5525403022766113, + 1.2117595672607422 + ], + "mean": [ + 0.5194264009320351, + 0.10808882922322567, + 0.5197762573383815, + 1.7688925584930815, + -0.29953467956552515, + -0.009040712931030676, + 0.5946066851586347, + 0.1450713845479617, + 0.5655443782999531, + 0.03370911358491344, + -0.06345323249756106, + 0.15650386904123778 + ], + "std": [ + 0.06940804680395932, + 0.21479389958966238, + 0.07987378005717213, + 2.5328884145755013, + 0.37015053086730393, + 0.14033346903287636, + 0.05500649146642034, + 0.15124297160388672, + 0.09363130714884703, + 3.03814882283887, + 0.21562238267324205, + 0.34913428619424713 + ], + "count": [ + 58885 + ], + "q01": [ + 0.389680832056878, + -0.17079152837311037, + 0.39821232917555377, + -3.138355683944816, + -0.7794698389047763, + -0.2292111271434537, + 0.5201355614396355, + 0.010009717834048406, + 0.3473206492241046, + -3.1292262955378995, + -0.4931042039709613, + -0.39392433108198205 + ], + "q10": [ + 0.42069872243959483, + -0.1644398217900128, + 0.40948145367679184, + -2.609507988991038, + -0.7376621670915048, + -0.19173335222881605, + 0.5505130729366431, + 0.031160920484378817, + 0.444289028673263, + -3.0458564102249928, + -0.43339771249706854, + -0.20578126489476734 + ], + "q50": [ + 0.54287060969675, + 0.08208830090982204, + 0.5136938051063558, + 3.0231563729371027, + -0.3334930943115573, + 0.005159299527350032, + 0.5705313177680298, + 0.05576278260510965, + 0.6221501000520611, + 0.08224261688253993, + 0.0023568181324399736, + 0.057234217959009334 + ], + "q90": [ + 
0.59671051597898, + 0.36814967665609416, + 0.6259625015535398, + 3.123177169155181, + 0.2582628807072582, + 0.15080460679812915, + 0.6909623712983989, + 0.36407326703235277, + 0.6393178833446403, + 2.811600900308528, + 0.1710100649519071, + 0.6242686543181142 + ], + "q99": [ + 0.6079055905141758, + 0.3797359990109224, + 0.6530560232359751, + 3.138519684929094, + 0.33900879084841173, + 0.25563831616345334, + 0.7036422698800308, + 0.5043118200846742, + 0.6496623410871386, + 3.1286655629008155, + 0.2654963193701809, + 0.7082099434004004 + ] + }, + "observation.images.camera_front": { + "min": [ + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ] + ], + "max": [ + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ] + ], + "mean": [ + [ + [ + 0.5100816380557235 + ] + ], + [ + [ + 0.48797377496665395 + ] + ], + [ + [ + 0.434097923507289 + ] + ] + ], + "std": [ + [ + [ + 0.005841510955391268 + ] + ], + [ + [ + 0.005956098895297586 + ] + ], + [ + [ + 0.0049887549805231225 + ] + ] + ], + "count": [ + 13472 + ], + "q01": [ + [ + [ + 0.1065357721395866 + ] + ], + [ + [ + 0.09551342896686918 + ] + ], + [ + [ + 0.07985796055323706 + ] + ] + ], + "q10": [ + [ + [ + 0.3038424725838036 + ] + ], + [ + [ + 0.262236796069868 + ] + ], + [ + [ + 0.20546917169830345 + ] + ] + ], + "q50": [ + [ + [ + 0.5127494663087449 + ] + ], + [ + [ + 0.5081266972660918 + ] + ], + [ + [ + 0.40105148618602443 + ] + ] + ], + "q90": [ + [ + [ + 0.6909139072635854 + ] + ], + [ + [ + 0.6941647590417448 + ] + ], + [ + [ + 0.6879961139373174 + ] + ] + ], + "q99": [ + [ + [ + 0.8289626568583892 + ] + ], + [ + [ + 0.8531150291535311 + ] + ], + [ + [ + 0.852308856360926 + ] + ] + ] + } +} \ No newline at end of file diff --git a/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/meta/tasks.parquet b/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/meta/tasks.parquet new file mode 100644 index 
0000000000000000000000000000000000000000..602767a0ee9648be1b068fc83f54844d4ba85ade Binary files /dev/null and b/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/meta/tasks.parquet differ diff --git a/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/videos/observation.images.camera_front/chunk-000/file-000.mp4 b/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/videos/observation.images.camera_front/chunk-000/file-000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..5417872eb5475e0d9bb3f66bbcd9e73733016404 --- /dev/null +++ b/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/videos/observation.images.camera_front/chunk-000/file-000.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1426a6fcb7267b05a233e642b13c676e98c1a8e5829c4c73338240fd126fce0 +size 151427 diff --git a/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/videos/observation.images.camera_left/chunk-000/file-000.mp4 b/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/videos/observation.images.camera_left/chunk-000/file-000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..3413c980c64fc69f9f147ef1838c1bc346e48c14 --- /dev/null +++ b/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/videos/observation.images.camera_left/chunk-000/file-000.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d991c3eea5cfd358c57f9c6974f8c52d26d94997f584a3c9c8d9a0b9bb5b3dcd +size 240182 diff --git a/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/videos/observation.images.camera_right/chunk-000/file-000.mp4 
b/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/videos/observation.images.camera_right/chunk-000/file-000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..0e40a6c12ed974a08ff64376c4a15c094d251d13 --- /dev/null +++ b/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water/videos/observation.images.camera_right/chunk-000/file-000.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b029eea41d08036d02a96a0081346f5c8178a7cdd793ed8001bba6ec7dac22c +size 138128 diff --git a/assets/examples/av_v2_03292026_wdinfo/data/00000000.tar b/assets/examples/av_v2_03292026_wdinfo/data/00000000.tar new file mode 100644 index 0000000000000000000000000000000000000000..82482c3c9e15e7cb8537810a3f7afdc2dac138f2 --- /dev/null +++ b/assets/examples/av_v2_03292026_wdinfo/data/00000000.tar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:417bb12ff755070b342f86a117a6c31489009ee29c8404b9679d32e73c724c0d +size 30556160 diff --git a/assets/examples/av_v2_03292026_wdinfo/manifest.json b/assets/examples/av_v2_03292026_wdinfo/manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..aa81f074bf198ab92c49151d9a3ccea2a91f11f5 --- /dev/null +++ b/assets/examples/av_v2_03292026_wdinfo/manifest.json @@ -0,0 +1,12 @@ +{ + "source": "s3://nv-00-10206-robot/cosmos3_action_data/av_v2_03292026/00000000.tar", + "local_wdinfo": "wdinfo.json", + "tar": "data/00000000.tar", + "example_member": "c9dc6d50-9ae4-4f53-bcd1-149cfc5e4f55_11326911000.pkl", + "history_steps": 21, + "future_steps": 60, + "route_shape": [ + 20, + 3 + ] +} diff --git a/assets/examples/av_v2_03292026_wdinfo/wdinfo.json b/assets/examples/av_v2_03292026_wdinfo/wdinfo.json new file mode 100644 index 0000000000000000000000000000000000000000..5cddbe1b12274dcda31a9477e3e24b8f5160b9f7 --- /dev/null +++ b/assets/examples/av_v2_03292026_wdinfo/wdinfo.json @@ -0,0 +1,11 @@ +{ + 
"data_keys": [ + "pkl" + ], + "chunk_size": 10, + "root": "data", + "data_list": [ + "00000000.tar" + ], + "total_key_count": 10 +} diff --git a/assets/examples/bridge_lerobot_v3/README.md b/assets/examples/bridge_lerobot_v3/README.md new file mode 100644 index 0000000000000000000000000000000000000000..81dc6e5e06c723cd5a25e585e27a246c0da78acd --- /dev/null +++ b/assets/examples/bridge_lerobot_v3/README.md @@ -0,0 +1,191 @@ +--- +pretty_name: Bridge LeRobot v3 +library_name: lerobot +tags: +- robotics +- robot-learning +- vision-language-action +- bridgedata +- bridge +- lerobot +- LeRobotDataset-v3 +license: openmdw-1.0 +--- + +# Bridge LeRobot v3 + +## Dataset Summary + +`nvidia/bridge_lerobot_v3` is a LeRobotDataset v3.0 conversion of the BridgeDataset / BridgeData V2 robot manipulation dataset. BridgeData V2 is a large-scale real-world robotics dataset collected to support scalable robot learning, including imitation learning, offline reinforcement learning, and open-vocabulary multi-task policies conditioned on goal images or natural-language instructions. + +This repository packages Bridge trajectories in the LeRobot v3 layout with Parquet state/action data, MP4 video observations, and structured LeRobot metadata. The converted repository's `meta/info.json` reports 50,415 episodes, 1,801,090 frames, 22,199 task IDs, and a 5 Hz sampling rate. + +## Dataset Details + +### Dataset Description + +BridgeData V2 contains robot manipulation behaviors collected across varied tabletop and toy-kitchen environments. The upstream dataset was designed to study generalization across tasks, objects, environments, institutions, and conditioning modes such as language instructions and goal images. + +This LeRobot v3 conversion preserves the dataset for workflows that use the LeRobot data loader and Hugging Face Hub-native robotics dataset conventions. 
The conversion changes the storage layout: frame-level tabular data is stored in Apache Parquet shards, visual observations are stored as MP4 videos, and metadata records the schema, feature statistics, task IDs, episode boundaries, and path templates. + +### Dataset Sources + +- Repository: https://huggingface.co/datasets/nvidia/bridge_lerobot_v3 +- BridgeData V2 project page: https://rail-berkeley.github.io/bridgedata/ +- BridgeData V2 paper: https://arxiv.org/abs/2308.12952 +- LeRobotDataset v3.0 documentation: https://huggingface.co/docs/lerobot/lerobot-dataset-v3 + +## Uses + +### Direct Use + +This dataset is intended for research and development in: + +- robot imitation learning +- offline reinforcement learning +- vision-language-action model training +- goal-conditioned and language-conditioned robot policy learning +- multi-camera visuomotor policy learning +- evaluation of LeRobot-compatible training, streaming, and data-loading workflows + +### Out-of-Scope Use + +This dataset is not intended to be used as the sole validation source for safety-critical robot deployment. Policies trained on this data should be evaluated in the target environment, with appropriate robot safety controls, before any physical deployment. + +## Dataset Structure + +The repository follows the LeRobotDataset v3.0 layout: + +```text +. ++-- data/ +| `-- chunk-000/ +| `-- file-*.parquet ++-- meta/ +| +-- info.json +| +-- stats.json +| +-- tasks.parquet +| `-- episodes/ +| `-- chunk-000/ +| `-- file-*.parquet +`-- videos/ + `-- observation.images.image_*/ + `-- chunk-000/ + `-- file-*.mp4 +``` + +### Repository Metadata + +The following values are taken from the repository's `meta/info.json`. 
+ +| Field | Value | +|---|---:| +| LeRobot codebase version | `v3.0` | +| Episodes | 50,415 | +| Frames | 1,801,090 | +| Task IDs | 22,199 | +| Split | `train: 0:50415` | +| FPS | 5 | +| Average episode length | 35.7 frames, about 7.1 seconds | +| Robot type in metadata | `null` | + +The upstream BridgeData V2 project describes the source data as collected on a WidowX 250 6DOF robot arm at 5 Hz. The converted repository does not populate `robot_type` in `meta/info.json`. + +### Features + +The following feature schema is declared in `meta/info.json`. + +| Feature | Type | Shape / Details | +|---|---|---| +| `observation.images.image_0` | video | RGB, 480 x 640, AV1 MP4, 5 FPS, no audio | +| `observation.images.image_1` | video | RGB, 480 x 640, AV1 MP4, 5 FPS, no audio | +| `observation.images.image_2` | video | RGB, 480 x 640, AV1 MP4, 5 FPS, no audio | +| `observation.images.image_3` | video | RGB, 480 x 640, AV1 MP4, 5 FPS, no audio | +| `observation.state` | `float32` | 7-dimensional robot state | +| `action` | `float32` | 7-dimensional robot action | +| `timestamp` | `float32` | Frame timestamp | +| `frame_index` | `int64` | Frame index within episode | +| `episode_index` | `int64` | Episode index | +| `task_index` | `int64` | Task ID | +| `index` | `int64` | Global frame index | + +### File Format + +- Frame-level state/action data: Apache Parquet under `data/`. +- Episode metadata: chunked Parquet under `meta/episodes/`. +- Task metadata: `meta/tasks.parquet`. +- Dataset schema and statistics: `meta/info.json` and `meta/stats.json`. +- Video observations: AV1-encoded MP4 files under `videos/`. + +## Loading + +Authenticate with Hugging Face if required, then download or stream the dataset using LeRobot-compatible tooling. 
+ +```python +from huggingface_hub import snapshot_download + +repo_dir = snapshot_download( + repo_id="nvidia/bridge_lerobot_v3", + repo_type="dataset", + token=True, +) + +print(repo_dir) +``` + +Example LeRobot usage: + +```python +from lerobot.datasets.lerobot_dataset import LeRobotDataset + +dataset = LeRobotDataset("nvidia/bridge_lerobot_v3") +print(dataset.num_episodes) +print(dataset.meta.info["features"].keys()) +``` + +## Dataset Creation + +### Source Data + +The source dataset is BridgeData V2, a real-world robot manipulation dataset with diverse tasks, objects, camera poses, and environments. The BridgeData V2 project page reports 60,096 trajectories across 24 environments and 13 skills, including teleoperated demonstrations and scripted pick-and-place rollouts. + +This converted repository reports 50,415 episodes in `meta/info.json`; users should treat the repository metadata as authoritative for this LeRobot package and consult upstream BridgeData V2 documentation for the full source dataset description. + +### Conversion + +This repository converts Bridge data into LeRobotDataset v3.0. In v3, multiple episodes can be concatenated into larger Parquet and MP4 shard files, while metadata is used to recover per-episode boundaries and feature statistics. This layout reduces file-system pressure and supports Hub-native loading and streaming workflows. + +## Bias, Risks, and Limitations + +- The dataset reflects the embodiment, control frequency, camera setup, task distribution, and environments of the source BridgeData V2 collection. +- The upstream data is concentrated around tabletop and toy-kitchen manipulation tasks; policies trained only on this dataset may not generalize to other robots, objects, lighting, homes, labs, or industrial settings. +- Natural-language task annotations are inherited from the source/conversion pipeline and should be inspected before task-specific filtering or evaluation. 
+- The repository metadata reports a converted package with fewer episodes than the full upstream BridgeData V2 trajectory count. +- This dataset should not be used as the sole basis for validating safe real-world robot behavior. + +## Personal and Sensitive Information + +BridgeData V2 is robot manipulation data collected in controlled robotics environments. It is not intended to contain personal or sensitive information. Because the source uses real camera imagery, users should still inspect samples for their own release, redistribution, or downstream demo requirements. + +## Citation + +If you use this dataset, please cite the original BridgeData V2 paper: + +```bibtex +@inproceedings{walke2023bridgedata, + title={BridgeData V2: A Dataset for Robot Learning at Scale}, + author={Walke, Homer and Black, Kevin and Lee, Abraham and Kim, Moo Jin and Du, Max and Zheng, Chongyi and Zhao, Tony and Hansen-Estruch, Philippe and Vuong, Quan and He, Andre and Myers, Vivek and Fang, Kuan and Finn, Chelsea and Levine, Sergey}, + booktitle={Conference on Robot Learning (CoRL)}, + year={2023} +} +``` + +Please also cite or reference LeRobot if you use LeRobot tooling, dataset loaders, or streaming interfaces. 
+ +## References + +- BridgeData V2 project page: https://rail-berkeley.github.io/bridgedata/ +- BridgeData V2 arXiv paper: https://arxiv.org/abs/2308.12952 +- BridgeData V2 Hugging Face paper page: https://huggingface.co/papers/2308.12952 +- LeRobotDataset v3.0 blog: https://huggingface.co/blog/lerobot-datasets-v3 +- LeRobotDataset v3.0 documentation: https://huggingface.co/docs/lerobot/lerobot-dataset-v3 \ No newline at end of file diff --git a/assets/examples/bridge_lerobot_v3/data/chunk-000/file-000.parquet b/assets/examples/bridge_lerobot_v3/data/chunk-000/file-000.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7e4315748d35e213d69d1996c1f3188611dd9711 Binary files /dev/null and b/assets/examples/bridge_lerobot_v3/data/chunk-000/file-000.parquet differ diff --git a/assets/examples/bridge_lerobot_v3/manifest.json b/assets/examples/bridge_lerobot_v3/manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..2deba670fd7bf40d41622002a4e962520897ce38 --- /dev/null +++ b/assets/examples/bridge_lerobot_v3/manifest.json @@ -0,0 +1,11 @@ +{ + "source_repo": "nvidia/bridge_lerobot_v3", + "episodes": [ + 0, + 1 + ], + "video_files": [ + "videos/observation.images.image_0/chunk-000/file-000.mp4" + ], + "notes": "Local two-episode Bridge demo subset; no HF runtime video streaming required." 
+} diff --git a/assets/examples/bridge_lerobot_v3/meta/episodes/chunk-000/file-000.parquet b/assets/examples/bridge_lerobot_v3/meta/episodes/chunk-000/file-000.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ada70e27595f411dfcc8088b72fd565e7bb8bd09 --- /dev/null +++ b/assets/examples/bridge_lerobot_v3/meta/episodes/chunk-000/file-000.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d65cf968ca396078256a56da1802e558ff4597f8d0d0e9c413bcb24e9027f6e +size 123838 diff --git a/assets/examples/bridge_lerobot_v3/meta/info.json b/assets/examples/bridge_lerobot_v3/meta/info.json new file mode 100644 index 0000000000000000000000000000000000000000..1be195f5b1b3321ac28984187034edc3b82bb04e --- /dev/null +++ b/assets/examples/bridge_lerobot_v3/meta/info.json @@ -0,0 +1,94 @@ +{ + "codebase_version": "v3.0", + "robot_type": null, + "total_episodes": 2, + "total_frames": 46, + "total_tasks": 1, + "chunks_size": 1000, + "data_files_size_in_mb": 100, + "video_files_size_in_mb": 200, + "fps": 5, + "splits": { + "train": "0:2" + }, + "data_path": "data/chunk-{chunk_index:03d}/file-{file_index:03d}.parquet", + "video_path": "videos/{video_key}/chunk-{chunk_index:03d}/file-{file_index:03d}.mp4", + "features": { + "observation.images.image_0": { + "dtype": "video", + "shape": [ + 480, + 640, + 3 + ], + "names": [ + "height", + "width", + "rgb" + ], + "info": { + "video.height": 480, + "video.width": 640, + "video.codec": "av1", + "video.pix_fmt": "yuv420p", + "video.is_depth_map": false, + "video.fps": 5, + "video.channels": 3, + "has_audio": false + } + }, + "observation.state": { + "dtype": "float32", + "shape": [ + 7 + ], + "names": [ + "state" + ] + }, + "action": { + "dtype": "float32", + "shape": [ + 7 + ], + "names": [ + "action" + ] + }, + "timestamp": { + "dtype": "float32", + "shape": [ + 1 + ], + "names": null + }, + "frame_index": { + "dtype": "int64", + "shape": [ + 1 + ], + "names": null + }, + "episode_index": { + 
"dtype": "int64", + "shape": [ + 1 + ], + "names": null + }, + "index": { + "dtype": "int64", + "shape": [ + 1 + ], + "names": null + }, + "task_index": { + "dtype": "int64", + "shape": [ + 1 + ], + "names": null + } + } +} diff --git a/assets/examples/bridge_lerobot_v3/meta/stats.json b/assets/examples/bridge_lerobot_v3/meta/stats.json new file mode 100644 index 0000000000000000000000000000000000000000..74046d8bc62b7dc32e6f875410f1f638ce7fd342 --- /dev/null +++ b/assets/examples/bridge_lerobot_v3/meta/stats.json @@ -0,0 +1,966 @@ +{ + "observation.state": { + "min": [ + -0.04167502000927925, + -0.3945816159248352, + -0.15537554025650024, + -3.141592502593994, + -1.4992541074752808, + -3.14153790473938, + 0.04637829214334488 + ], + "max": [ + 0.5862360596656799, + 0.4034728705883026, + 0.36494991183280945, + 1.514088749885559, + 1.570796251296997, + 3.1415255069732666, + 1.1154625415802002 + ], + "mean": [ + 0.3078657070409933, + 0.031129854114640153, + 0.06524272890027896, + 0.010478528822645234, + -0.1153295264240988, + 0.13293259596689477, + 0.7535287298011476 + ], + "std": [ + 0.06390337703414369, + 0.09462637224137227, + 0.053973478626083184, + 0.14631237221965346, + 0.17009178266906624, + 0.6086491420893041, + 0.3311070209320066 + ], + "count": [ + 1801090 + ], + "q01": [ + 0.2463635235491362, + -0.05778863449398043, + 0.016231048745205447, + -0.1276350840652153, + -0.2652197166812523, + -0.17831831795983122, + 0.4352433491416346 + ], + "q10": [ + 0.2558413467940049, + -0.048030725569368646, + 0.021219741154135583, + -0.09136181372777585, + -0.2250875629406645, + -0.12693452700587574, + 0.4425213950537474 + ], + "q50": [ + 0.30547148123085394, + 0.028478191953509382, + 0.061788129019053534, + 0.009242004648470028, + -0.11450417784936784, + 0.12738886565246837, + 0.8525009665260582 + ], + "q90": [ + 0.3599411511270558, + 0.11057157999679051, + 0.11297343197112365, + 0.11136320995120631, + -0.00892123863592227, + 0.3888200032384959, + 0.9973646300570351 + ], + 
"q99": [ + 0.36770364361385066, + 0.12096997648229695, + 0.12923741839066463, + 0.14542505776055412, + 0.022217630370894997, + 0.43883905381625504, + 1.0043319804665372 + ] + }, + "episode_index": { + "min": [ + 0 + ], + "max": [ + 395 + ], + "mean": [ + 17.263674774719753 + ], + "std": [ + 28.83284715650058 + ], + "count": [ + 1801090 + ], + "q01": [ + 17.263674774719753 + ], + "q10": [ + 17.263674774719753 + ], + "q50": [ + 17.26367477471977 + ], + "q90": [ + 17.263674774719785 + ], + "q99": [ + 17.26367477471979 + ] + }, + "frame_index": { + "min": [ + 0 + ], + "max": [ + 118 + ], + "mean": [ + 19.619974015734915 + ], + "std": [ + 14.763794098949749 + ], + "count": [ + 1801090 + ], + "q01": [ + 0.00030421277553703693 + ], + "q10": [ + 3.4375358736604062 + ], + "q50": [ + 19.329761038813803 + ], + "q90": [ + 35.73429024692172 + ], + "q99": [ + 39.23594070484433 + ] + }, + "action": { + "min": [ + -0.21280786395072937, + -0.22161394357681274, + -0.4670022130012512, + -0.9053626656532288, + -0.728705883026123, + -1.1610136032104492, + 0.0 + ], + "max": [ + 0.14468924701213837, + 0.28896045684814453, + 0.25066766142845154, + 0.9110258221626282, + 0.6760650873184204, + 1.1013237237930298, + 1.0 + ], + "mean": [ + 0.00057356759712255, + 0.00011228985106677395, + 0.00033733821111343496, + -3.2933660066906675e-05, + -0.0018181216694647497, + 0.00010204879316899695, + 0.6233483145627287 + ], + "std": [ + 0.010469277762356712, + 0.01524215121861673, + 0.012821488448696097, + 0.031125604404310805, + 0.032741672740958225, + 0.055620604208622804, + 0.48136517793996136 + ], + "count": [ + 1801090 + ], + "q01": [ + -0.021358451629461966, + -0.029393954179487956, + -0.020992353446725114, + -0.06808332768846173, + -0.07911264819669223, + -0.11372979514314813, + 0.14180975403225513 + ], + "q10": [ + -0.011676061926034044, + -0.017996865470775638, + -0.013541170337680411, + -0.033728621240137514, + -0.041577432476653686, + -0.05835590433099303, + 0.14237743007083528 + ], + "q50": 
[ + 0.000300881419541358, + -2.5015201984531362e-05, + -0.001233054505179989, + -0.0005056874720216848, + -0.0004731729701556107, + 0.0003401666623676183, + 0.8210043724514866 + ], + "q90": [ + 0.01291656058204435, + 0.01787281343314976, + 0.018174764709341165, + 0.033431397688145396, + 0.033405877348579645, + 0.0560117568652242, + 0.9913997376187794 + ], + "q99": [ + 0.02139908763176426, + 0.029476266857249304, + 0.03265355802175534, + 0.07228712693697929, + 0.06867303567220237, + 0.10840311474445433, + 0.9999937720417067 + ] + }, + "observation.images.image_1": { + "min": [ + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ] + ], + "max": [ + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ] + ], + "mean": [ + [ + [ + 0.23808484892300974 + ] + ], + [ + [ + 0.21420300786234783 + ] + ], + [ + [ + 0.20480729820984775 + ] + ] + ], + "std": [ + [ + [ + 0.24000894290405092 + ] + ], + [ + [ + 0.21610636481161952 + ] + ], + [ + [ + 0.20811969692647148 + ] + ] + ], + "count": [ + 1801038 + ], + "q01": [ + [ + [ + 0.017376900679127255 + ] + ], + [ + [ + 0.01816247306892097 + ] + ], + [ + [ + 0.022169398589928356 + ] + ] + ], + "q10": [ + [ + [ + 0.058939146797035866 + ] + ], + [ + [ + 0.054181793322751755 + ] + ], + [ + [ + 0.05851348748686418 + ] + ] + ], + "q50": [ + [ + [ + 0.24390417350821517 + ] + ], + [ + [ + 0.21150810916563514 + ] + ], + [ + [ + 0.19402058424434335 + ] + ] + ], + "q90": [ + [ + [ + 0.40179968313407505 + ] + ], + [ + [ + 0.37710226136613284 + ] + ], + [ + [ + 0.37200396696903076 + ] + ] + ], + "q99": [ + [ + [ + 0.45799038689270954 + ] + ], + [ + [ + 0.4420613852265376 + ] + ], + [ + [ + 0.44403614462942814 + ] + ] + ] + }, + "timestamp": { + "min": [ + 0.0 + ], + "max": [ + 23.6 + ], + "mean": [ + 3.923994803146983 + ], + "std": [ + 2.9527588197899495 + ], + "count": [ + 1801090 + ], + "q01": [ + 6.0842475107852296e-05 + ], + "q10": [ + 0.6875071746659088 + ], + "q50": [ + 3.866315450153611 + ], + "q90": [ + 
7.146858049450175 + ], + "q99": [ + 7.847188141048853 + ] + }, + "task_index": { + "min": [ + 0 + ], + "max": [ + 364 + ], + "mean": [ + 9.508057343053373 + ], + "std": [ + 25.584243677940766 + ], + "count": [ + 1801090 + ], + "q01": [ + 9.508057343053373 + ], + "q10": [ + 9.508057343053375 + ], + "q50": [ + 9.508057343053395 + ], + "q90": [ + 9.508057343053409 + ], + "q99": [ + 9.508057343053412 + ] + }, + "index": { + "min": [ + 0 + ], + "max": [ + 13495 + ], + "mean": [ + 683.7963394388953 + ], + "std": [ + 1011.3259831719954 + ], + "count": [ + 1801090 + ], + "q01": [ + 664.1766696359362 + ], + "q10": [ + 667.6139012968208 + ], + "q50": [ + 683.5079981450107 + ], + "q90": [ + 699.9106556700821 + ], + "q99": [ + 703.4123061280047 + ] + }, + "observation.images.image_0": { + "min": [ + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ] + ], + "max": [ + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ] + ], + "mean": [ + [ + [ + 0.4927899446119918 + ] + ], + [ + [ + 0.43718849912642893 + ] + ], + [ + [ + 0.4049071781276338 + ] + ] + ], + "std": [ + [ + [ + 0.04903922015781917 + ] + ], + [ + [ + 0.054039313949008884 + ] + ], + [ + [ + 0.06398983917342628 + ] + ] + ], + "count": [ + 1801038 + ], + "q01": [ + [ + [ + 0.028015927559851838 + ] + ], + [ + [ + 0.03397041537724499 + ] + ], + [ + [ + 0.034433446895848485 + ] + ] + ], + "q10": [ + [ + [ + 0.11761248327446759 + ] + ], + [ + [ + 0.12180146354404779 + ] + ], + [ + [ + 0.11710207961092006 + ] + ] + ], + "q50": [ + [ + [ + 0.5229947438553765 + ] + ], + [ + [ + 0.44429792190932454 + ] + ], + [ + [ + 0.3921117613148599 + ] + ] + ], + "q90": [ + [ + [ + 0.8008342295803912 + ] + ], + [ + [ + 0.7247795469245689 + ] + ], + [ + [ + 0.7064622418840562 + ] + ] + ], + "q99": [ + [ + [ + 0.9310260844205789 + ] + ], + [ + [ + 0.8931819628452381 + ] + ], + [ + [ + 0.8822519621921744 + ] + ] + ] + }, + "observation.images.image_2": { + "min": [ + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ], + [ 
+ [ + 0.0 + ] + ] + ], + "max": [ + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ] + ], + "mean": [ + [ + [ + 0.22753174550311242 + ] + ], + [ + [ + 0.20561787066183748 + ] + ], + [ + [ + 0.19600823597907133 + ] + ] + ], + "std": [ + [ + [ + 0.2303201611903527 + ] + ], + [ + [ + 0.20878867606799872 + ] + ], + [ + [ + 0.20155566396380575 + ] + ] + ], + "count": [ + 1801038 + ], + "q01": [ + [ + [ + 0.01211383757424793 + ] + ], + [ + [ + 0.012276410483772641 + ] + ], + [ + [ + 0.01321032164222202 + ] + ] + ], + "q10": [ + [ + [ + 0.05121966583649511 + ] + ], + [ + [ + 0.04728960396760485 + ] + ], + [ + [ + 0.0463226160539576 + ] + ] + ], + "q50": [ + [ + [ + 0.2372462692701267 + ] + ], + [ + [ + 0.20519831597920832 + ] + ], + [ + [ + 0.18687017333168612 + ] + ] + ], + "q90": [ + [ + [ + 0.38424357423382227 + ] + ], + [ + [ + 0.36282325690794803 + ] + ], + [ + [ + 0.35931688603448336 + ] + ] + ], + "q99": [ + [ + [ + 0.44992452922757115 + ] + ], + [ + [ + 0.43853143316629245 + ] + ], + [ + [ + 0.4431074246926838 + ] + ] + ] + }, + "observation.images.image_3": { + "min": [ + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ] + ], + "max": [ + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ] + ], + "mean": [ + [ + [ + 0.04973473547975949 + ] + ], + [ + [ + 0.041110336188520565 + ] + ], + [ + [ + 0.03450967238737552 + ] + ] + ], + "std": [ + [ + [ + 0.1369734922735789 + ] + ], + [ + [ + 0.11269590804336074 + ] + ], + [ + [ + 0.09838749201534187 + ] + ] + ], + "count": [ + 1801038 + ], + "q01": [ + [ + [ + 0.0012642076543331999 + ] + ], + [ + [ + 0.0029803378055088445 + ] + ], + [ + [ + 0.0008574280984943751 + ] + ] + ], + "q10": [ + [ + [ + 0.006922960656197909 + ] + ], + [ + [ + 0.00766833310799447 + ] + ], + [ + [ + 0.004724069784713882 + ] + ] + ], + "q50": [ + [ + [ + 0.051436814470232954 + ] + ], + [ + [ + 0.04120058475184378 + ] + ], + [ + [ + 0.03273733587483956 + ] + ] + ], + "q90": [ + [ + [ + 
0.08785494596489245 + ] + ], + [ + [ + 0.07315302760390548 + ] + ], + [ + [ + 0.06678022662441209 + ] + ] + ], + "q99": [ + [ + [ + 0.10623778579211848 + ] + ], + [ + [ + 0.09674034260551785 + ] + ], + [ + [ + 0.09280312347382377 + ] + ] + ] + } +} \ No newline at end of file diff --git a/assets/examples/bridge_lerobot_v3/meta/tasks.parquet b/assets/examples/bridge_lerobot_v3/meta/tasks.parquet new file mode 100644 index 0000000000000000000000000000000000000000..a872c0b521e80e9c255e78f1bd05cc740ade7e40 Binary files /dev/null and b/assets/examples/bridge_lerobot_v3/meta/tasks.parquet differ diff --git a/assets/examples/bridge_lerobot_v3/videos/observation.images.image_0/chunk-000/file-000.mp4 b/assets/examples/bridge_lerobot_v3/videos/observation.images.image_0/chunk-000/file-000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..9930e268df26c27ccdb8437d5abcfcbc4fa725f6 --- /dev/null +++ b/assets/examples/bridge_lerobot_v3/videos/observation.images.image_0/chunk-000/file-000.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8c90ded6c64a9999f8d79e986a8817617342b1a97db0178ccff5533a7334386 +size 171313 diff --git a/assets/examples/droid_plus_lerobot_640x360_20260412/success/data/chunk-000/file-000.parquet b/assets/examples/droid_plus_lerobot_640x360_20260412/success/data/chunk-000/file-000.parquet new file mode 100644 index 0000000000000000000000000000000000000000..50b64ce771c22f89a579ff0f4fcc9c6b314be1d4 --- /dev/null +++ b/assets/examples/droid_plus_lerobot_640x360_20260412/success/data/chunk-000/file-000.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8726a2a7eafaa6351d61b1f7ce114f764e65d6c212a5350f9a2894f9289c01dc +size 168056 diff --git a/assets/examples/droid_plus_lerobot_640x360_20260412/success/manifest.json b/assets/examples/droid_plus_lerobot_640x360_20260412/success/manifest.json new file mode 100644 index 
0000000000000000000000000000000000000000..aeadc1f1e5d5f2fd34d22b6dcdc6fc747a5112ae --- /dev/null +++ b/assets/examples/droid_plus_lerobot_640x360_20260412/success/manifest.json @@ -0,0 +1,21 @@ +{ + "source_root": "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/droid_plus_lerobot_640x360_20260412/success", + "source_episode": 0, + "local_episode": 0, + "source_index_range": [ + 0, + 471 + ], + "frames": 471, + "source_task_indices": [ + 0 + ], + "tasks": [ + "Pour the contents of the yellow cup into the bowl | Pour the contents of the yellow cup into the pink bowl | Pour the contents of the yellow cup into the bowl" + ], + "videos": [ + "videos/observation.image.wrist_image_left/chunk-000/file-000.mp4", + "videos/observation.image.exterior_image_1_left/chunk-000/file-000.mp4", + "videos/observation.image.exterior_image_2_left/chunk-000/file-000.mp4" + ] +} diff --git a/assets/examples/droid_plus_lerobot_640x360_20260412/success/meta/episodes/chunk-000/file-000.parquet b/assets/examples/droid_plus_lerobot_640x360_20260412/success/meta/episodes/chunk-000/file-000.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b3292ec2263d7497c3f5f7f4457e3bfd84d7b085 --- /dev/null +++ b/assets/examples/droid_plus_lerobot_640x360_20260412/success/meta/episodes/chunk-000/file-000.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:473efcf65c4ce7af0afb0e54e304db7d6b79bccbf8d31b2b4e014d992316fe92 +size 212825 diff --git a/assets/examples/droid_plus_lerobot_640x360_20260412/success/meta/info.json b/assets/examples/droid_plus_lerobot_640x360_20260412/success/meta/info.json new file mode 100644 index 0000000000000000000000000000000000000000..a88521957fb467a761baad9c24dbcc453587a96b --- /dev/null +++ b/assets/examples/droid_plus_lerobot_640x360_20260412/success/meta/info.json @@ -0,0 +1,179 @@ +{ + "codebase_version": "v3.0", + "robot_type": "panda", + "total_episodes": 1, + "total_frames": 471, + 
"total_tasks": 1, + "chunks_size": 1000, + "data_files_size_in_mb": 100, + "video_files_size_in_mb": 200, + "fps": 15, + "splits": { + "train": "0:1" + }, + "data_path": "data/chunk-{chunk_index:03d}/file-{file_index:03d}.parquet", + "video_path": "videos/{video_key}/chunk-{chunk_index:03d}/file-{file_index:03d}.mp4", + "features": { + "observation.image.exterior_image_1_left": { + "dtype": "video", + "shape": [ + 360, + 640, + 3 + ], + "info": { + "video.height": 360, + "video.width": 640, + "video.codec": "av1", + "video.pix_fmt": "yuv420p", + "video.is_depth_map": false, + "video.fps": 15, + "video.channels": 3, + "has_audio": false + } + }, + "observation.image.exterior_image_2_left": { + "dtype": "video", + "shape": [ + 360, + 640, + 3 + ], + "info": { + "video.height": 360, + "video.width": 640, + "video.codec": "av1", + "video.pix_fmt": "yuv420p", + "video.is_depth_map": false, + "video.fps": 15, + "video.channels": 3, + "has_audio": false + } + }, + "observation.image.wrist_image_left": { + "dtype": "video", + "shape": [ + 360, + 640, + 3 + ], + "info": { + "video.height": 360, + "video.width": 640, + "video.codec": "av1", + "video.pix_fmt": "yuv420p", + "video.is_depth_map": false, + "video.fps": 15, + "video.channels": 3, + "has_audio": false + } + }, + "observation.state.joint_positions": { + "dtype": "float32", + "shape": [ + 7 + ] + }, + "observation.state.joint_velocities": { + "dtype": "float32", + "shape": [ + 7 + ] + }, + "observation.state.joint_torques_computed": { + "dtype": "float32", + "shape": [ + 7 + ] + }, + "observation.state.motor_torques_measured": { + "dtype": "float32", + "shape": [ + 7 + ] + }, + "observation.state.cartesian_position": { + "dtype": "float32", + "shape": [ + 6 + ] + }, + "observation.state.gripper_position": { + "dtype": "float32", + "shape": [ + 1 + ] + }, + "action.joint_position": { + "dtype": "float32", + "shape": [ + 7 + ] + }, + "action.joint_velocity": { + "dtype": "float32", + "shape": [ + 7 + ] + }, + 
"action.cartesian_position": { + "dtype": "float32", + "shape": [ + 6 + ] + }, + "action.cartesian_velocity": { + "dtype": "float32", + "shape": [ + 6 + ] + }, + "action.gripper_position": { + "dtype": "float32", + "shape": [ + 1 + ] + }, + "action.gripper_velocity": { + "dtype": "float32", + "shape": [ + 1 + ] + }, + "timestamp": { + "dtype": "float32", + "shape": [ + 1 + ], + "names": null + }, + "frame_index": { + "dtype": "int64", + "shape": [ + 1 + ], + "names": null + }, + "episode_index": { + "dtype": "int64", + "shape": [ + 1 + ], + "names": null + }, + "index": { + "dtype": "int64", + "shape": [ + 1 + ], + "names": null + }, + "task_index": { + "dtype": "int64", + "shape": [ + 1 + ], + "names": null + } + } +} diff --git a/assets/examples/droid_plus_lerobot_640x360_20260412/success/meta/stats.json b/assets/examples/droid_plus_lerobot_640x360_20260412/success/meta/stats.json new file mode 100644 index 0000000000000000000000000000000000000000..479c0c016836ea9125e971944c08620f88f081b1 --- /dev/null +++ b/assets/examples/droid_plus_lerobot_640x360_20260412/success/meta/stats.json @@ -0,0 +1,1479 @@ +{ + "observation.state.cartesian_position": { + "min": [ + -0.22192564606666565, + -0.8099977970123291, + -0.24001094698905945, + -3.141592502593994, + -1.5703768730163574, + -3.14157772064209 + ], + "max": [ + 0.8575563430786133, + 0.8196876049041748, + 1.0111403465270996, + 3.1415927410125732, + 1.5684677362442017, + 3.1415927410125732 + ], + "mean": [ + 0.5196280976981249, + 0.0009853788756085775, + 0.32100291244511253, + 0.3187035548659042, + -0.09489115560178896, + -0.03782510194329433 + ], + "std": [ + 0.11966269381655047, + 0.1840951537741764, + 0.1660031327090243, + 2.9119070203353057, + 0.360529050389239, + 0.8159590967825275 + ], + "count": [ + 18691281 + ], + "q01": [ + 0.33624007223059466, + -0.18976536914494324, + 0.18393308966044905, + -2.7955397235198975, + -0.4950343184080683, + -0.8581685139966064 + ], + "q10": [ + 0.4017154337495113, + 
-0.14017804428073918, + 0.20761156135127543, + -2.3101865748721733, + -0.3693534979187698, + -0.6230976434378469 + ], + "q50": [ + 0.5273264945852854, + 0.0008513312644672381, + 0.30696488716482756, + 0.4898543102775791, + -0.09435368128802955, + -0.0375054693853198 + ], + "q90": [ + 0.6203302057398535, + 0.14186639768892967, + 0.460628229890261, + 2.625476859924513, + 0.18064562575296228, + 0.5474490313234763 + ], + "q99": [ + 0.6482050048288891, + 0.1945358777163571, + 0.5423836319291491, + 2.9444948977740983, + 0.3144336584654846, + 0.785962951659008 + ] + }, + "observation.image.exterior_image_2_left": { + "min": [ + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ] + ], + "max": [ + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ] + ], + "mean": [ + [ + [ + 0.39698857248908037 + ] + ], + [ + [ + 0.3937475306292987 + ] + ], + [ + [ + 0.35555214316674105 + ] + ] + ], + "std": [ + [ + [ + 0.061672021015474934 + ] + ], + [ + [ + 0.06194416401814127 + ] + ], + [ + [ + 0.07588382954177514 + ] + ] + ], + "count": [ + 6215443 + ], + "q01": [ + [ + [ + 0.012687200764395137 + ] + ], + [ + [ + 0.015487369328871509 + ] + ], + [ + [ + 0.0076571378838845875 + ] + ] + ], + "q10": [ + [ + [ + 0.08111056896217507 + ] + ], + [ + [ + 0.08284075636323286 + ] + ], + [ + [ + 0.06473367564152509 + ] + ] + ], + "q50": [ + [ + [ + 0.36832910582414286 + ] + ], + [ + [ + 0.3589757621010206 + ] + ], + [ + [ + 0.3108992482705161 + ] + ] + ], + "q90": [ + [ + [ + 0.7630344426080897 + ] + ], + [ + [ + 0.7623061364967888 + ] + ], + [ + [ + 0.7211189014908148 + ] + ] + ], + "q99": [ + [ + [ + 0.9261070922126955 + ] + ], + [ + [ + 0.940775088875296 + ] + ], + [ + [ + 0.9276931167688498 + ] + ] + ] + }, + "observation.image.wrist_image_left": { + "min": [ + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ] + ], + "max": [ + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ] + ], + "mean": [ + [ + [ + 0.39594309896648566 + ] + ], + 
[ + [ + 0.3836015788399831 + ] + ], + [ + [ + 0.32883697366271997 + ] + ] + ], + "std": [ + [ + [ + 0.08820112155555676 + ] + ], + [ + [ + 0.07983439022468844 + ] + ], + [ + [ + 0.09559483574519906 + ] + ] + ], + "count": [ + 6215443 + ], + "q01": [ + [ + [ + 0.0014113945006799314 + ] + ], + [ + [ + 0.00179868849642481 + ] + ], + [ + [ + 0.0015209398260128302 + ] + ] + ], + "q10": [ + [ + [ + 0.03525048252678373 + ] + ], + [ + [ + 0.035695954052051275 + ] + ], + [ + [ + 0.02578849513959022 + ] + ] + ], + "q50": [ + [ + [ + 0.40446845320818914 + ] + ], + [ + [ + 0.3797765310467597 + ] + ], + [ + [ + 0.30599089191797074 + ] + ] + ], + "q90": [ + [ + [ + 0.7497542348288407 + ] + ], + [ + [ + 0.7403067883982201 + ] + ], + [ + [ + 0.6716269838882366 + ] + ] + ], + "q99": [ + [ + [ + 0.9066662690066858 + ] + ], + [ + [ + 0.912195599992383 + ] + ], + [ + [ + 0.8757491027084656 + ] + ] + ] + }, + "observation.state.joint_velocities": { + "min": [ + -1.2325929403305054, + -1.1620159149169922, + -1.2910404205322266, + -1.1445194482803345, + -1.4311167001724243, + -1.8945889472961426, + -1.8736029863357544 + ], + "max": [ + 1.2808489799499512, + 1.1894278526306152, + 1.2651691436767578, + 1.0335216522216797, + 1.5209081172943115, + 1.4298126697540283, + 1.411964774131775 + ], + "mean": [ + 0.0007668096011207866, + 0.02928915365171192, + -0.0008063308920928154, + 0.016429417973647484, + -0.0015627907447299401, + 0.015991573555231696, + 0.0032758998234802173 + ], + "std": [ + 0.13515408974183327, + 0.2140052912241772, + 0.13556597382102004, + 0.18811877819383568, + 0.1867162168430174, + 0.21122018690081104, + 0.21438512383397482 + ], + "count": [ + 18691281 + ], + "q01": [ + -0.34629224272983733, + -0.4723289554009115, + -0.34824012860091785, + -0.45490254978797284, + -0.4973926538063903, + -0.5160135572878216, + -0.5645107826901703 + ], + "q10": [ + -0.11734571052638743, + -0.21926666450125706, + -0.116485136067324, + -0.18684185966035827, + -0.17172595693671794, + 
-0.18244989566737269, + -0.20481833845700168 + ], + "q50": [ + 0.00022609332018521527, + 0.01429435363721013, + -0.00022322329741366394, + 0.007950081860874253, + -0.0002005335566592241, + 0.0009510218348768809, + 0.0009504778402937328 + ], + "q90": [ + 0.11891803503242757, + 0.2756342521512834, + 0.11379391555507393, + 0.220554289191697, + 0.16409735451468954, + 0.24523778353038594, + 0.21529678507501568 + ], + "q99": [ + 0.34147168118681376, + 0.5228120580985741, + 0.3516862162535885, + 0.43597715991320773, + 0.4956068911281174, + 0.5959691983814939, + 0.5489515881695498 + ] + }, + "observation.state.gripper_position": { + "min": [ + 0.0 + ], + "max": [ + 1.0 + ], + "mean": [ + 0.3778461710802597 + ], + "std": [ + 0.40871666834322623 + ], + "count": [ + 18691281 + ], + "q01": [ + 3.773488006398992e-05 + ], + "q10": [ + 0.010282849615100587 + ], + "q50": [ + 0.3296462467631541 + ], + "q90": [ + 0.7898515831023881 + ], + "q99": [ + 0.8234262665263168 + ] + }, + "task_index": { + "min": [ + 0 + ], + "max": [ + 17193 + ], + "mean": [ + 3306.827166046032 + ], + "std": [ + 4013.6426122382063 + ], + "count": [ + 18691281 + ], + "q01": [ + 3306.827166046032 + ], + "q10": [ + 3306.827166046032 + ], + "q50": [ + 3306.827166046032 + ], + "q90": [ + 3306.8271660460323 + ], + "q99": [ + 3306.8271660460323 + ] + }, + "action.joint_position": { + "min": [ + -2.781099557876587, + -1.647990107536316, + -2.772181749343872, + -2.9508564472198486, + -2.7826988697052, + 0.17518290877342224, + -2.901715040206909 + ], + "max": [ + 2.751160144805908, + 1.6689813137054443, + 2.769918203353882, + -0.21164260804653168, + 2.781451463699341, + 4.402013778686523, + 2.90183162689209 + ], + "mean": [ + 0.013199600302419347, + 0.24892700740181023, + -0.01293894075853087, + -2.018944157148896, + -0.04106824765025123, + 2.342938950502355, + 0.0623517051785903 + ], + "std": [ + 0.35347317324885785, + 0.5065376421473036, + 0.31662624358528346, + 0.5059683232860253, + 0.5498160977741493, + 
0.47372670446560544, + 0.7925113426977689 + ], + "count": [ + 18691281 + ], + "q01": [ + -0.36044010782074737, + -0.6135316974597613, + -0.385181357782481, + -2.5957739609803463, + -0.6027203309453739, + 1.6828812378015896, + -0.6693693583677074 + ], + "q10": [ + -0.23459411447816506, + -0.2627565082526972, + -0.23944974213612572, + -2.4457957966659634, + -0.4243207013452144, + 1.9273194422573392, + -0.4622766051982292 + ], + "q50": [ + 0.009292829825423342, + 0.3014828460271428, + -0.011381819469281053, + -2.0281789913112007, + -0.03687058046149728, + 2.3593059403388486, + 0.06060105749585953 + ], + "q90": [ + 0.26578901397123483, + 0.658817845502543, + 0.21073156426598516, + -1.5823832781172136, + 0.3334117743292958, + 2.7236675646162176, + 0.5875851356970185 + ], + "q99": [ + 0.40981370808200224, + 0.7767221193137215, + 0.347007058174135, + -1.4263037393422948, + 0.5015067841267811, + 2.8618039285988606, + 0.801324502776132 + ] + }, + "action.gripper_position": { + "min": [ + 0.0 + ], + "max": [ + 1.0 + ], + "mean": [ + 0.418297932839273 + ], + "std": [ + 0.4425579088701078 + ], + "count": [ + 18691281 + ], + "q01": [ + 0.00019078585392472987 + ], + "q10": [ + 0.010803643046073442 + ], + "q50": [ + 0.35138024039898136 + ], + "q90": [ + 0.8835301288793914 + ], + "q99": [ + 0.9048730067515455 + ] + }, + "observation.state.motor_torques_measured": { + "min": [ + -25.94832420349121, + -87.83673858642578, + -44.32796096801758, + -16.94495391845703, + -12.420953750610352, + -13.297513008117676, + -6.907970428466797 + ], + "max": [ + 26.754270553588867, + 64.0811538696289, + 50.10732650756836, + 44.767879486083984, + 12.776189804077148, + 15.945478439331055, + 7.04659366607666 + ], + "mean": [ + -0.09045047201312431, + -34.29667075739162, + -0.1185023310644514, + 20.94341909516319, + 0.5291787647890451, + 2.2707021502402833, + 0.013084365002168205 + ], + "std": [ + 1.148625708486322, + 14.784722430062418, + 4.243052099757267, + 4.132280668526913, + 0.7285612021114827, 
+ 1.1053276390348445, + 0.2584544041576275 + ], + "count": [ + 18691281 + ], + "q01": [ + -2.175235539369352, + -49.377647248011534, + -5.826520769201629, + 15.149840095313714, + -0.6673080586285703, + 0.5264886278859034, + -0.3662641382413548 + ], + "q10": [ + -0.9983434446865149, + -45.986515490260174, + -3.458525153816551, + 17.381327260157597, + -0.22691314403367593, + 1.2250490392882252, + -0.25615728114593317 + ], + "q50": [ + -0.0860637464904619, + -36.16392208425017, + -0.08149806138187557, + 21.153030167693032, + 0.5418284713112982, + 2.345815182745681, + 0.019068463717365012 + ], + "q90": [ + 0.8025424046639105, + -18.79532275126766, + 3.138285404521963, + 24.165852188119906, + 1.2618411713323243, + 3.180750154588862, + 0.2733030996939559 + ], + "q99": [ + 1.9839008890842533, + -8.84785670251801, + 5.410357341149311, + 25.550521752397493, + 1.6461261861240115, + 3.6552713771874608, + 0.3841125363687029 + ] + }, + "observation.state.joint_positions": { + "min": [ + -2.664970874786377, + -1.6156227588653564, + -2.680800676345825, + -2.9055848121643066, + -2.6705946922302246, + 0.24893812835216522, + -2.761866807937622 + ], + "max": [ + 2.6687583923339844, + 1.5840554237365723, + 2.6957037448883057, + -0.31610623002052307, + 2.6621792316436768, + 4.28157901763916, + 2.755643367767334 + ], + "mean": [ + 0.013791949786537994, + 0.24427557830341348, + -0.01318882777251663, + -2.026164415286467, + -0.0411291928767819, + 2.341849359205603, + 0.06125256477512789 + ], + "std": [ + 0.3502433042554258, + 0.5105651212980146, + 0.31516304468241213, + 0.5018439625812248, + 0.5442562422655662, + 0.4701331203619676, + 0.7865961326524101 + ], + "count": [ + 18691281 + ], + "q01": [ + -0.35221861487232076, + -0.6151462496261719, + -0.3804700767197909, + -2.5923478694921736, + -0.5854883979097063, + 1.6903565635957598, + -0.6544480001387185 + ], + "q10": [ + -0.23285924999802607, + -0.29010091029711904, + -0.24030863436683778, + -2.457366233444359, + -0.41975775565802553, + 
1.9203418059194497, + -0.4623098906029416 + ], + "q50": [ + 0.009929714705934432, + 0.30511215625383054, + -0.01189937068433103, + -2.0319206082102745, + -0.03707056123462397, + 2.361159521249432, + 0.05984592351049138 + ], + "q90": [ + 0.26522281891194416, + 0.6558676369030764, + 0.21141054864193545, + -1.5921684408574053, + 0.3291723967052105, + 2.7202117083474326, + 0.5851309623636093 + ], + "q99": [ + 0.40319962215851335, + 0.7619416500985984, + 0.34202418876395896, + -1.447520622332787, + 0.48483662716734677, + 2.8473429714186076, + 0.7846589029264255 + ] + }, + "index": { + "min": [ + 0 + ], + "max": [ + 5272516 + ], + "mean": [ + 1310808.4007145364 + ], + "std": [ + 1287934.685416825 + ], + "count": [ + 18691281 + ], + "q01": [ + 1310329.4218393876 + ], + "q10": [ + 1310417.362258262 + ], + "q50": [ + 1310808.2273377946 + ], + "q90": [ + 1311199.3589162764 + ], + "q99": [ + 1311287.371586794 + ] + }, + "frame_index": { + "min": [ + 0 + ], + "max": [ + 34362 + ], + "mean": [ + 488.24993787210207 + ], + "std": [ + 1513.8511839233154 + ], + "count": [ + 18691281 + ], + "q01": [ + 9.271062717800033 + ], + "q10": [ + 97.2114816012389 + ], + "q50": [ + 488.0572185069863 + ], + "q90": [ + 879.2007030672643 + ], + "q99": [ + 967.2198291484269 + ] + }, + "timestamp": { + "min": [ + 0.0 + ], + "max": [ + 2290.8 + ], + "mean": [ + 32.54999585814025 + ], + "std": [ + 100.92341226155439 + ], + "count": [ + 18691281 + ], + "q01": [ + 0.6180708477615872 + ], + "q10": [ + 6.480765440007698 + ], + "q50": [ + 32.53720037807416 + ], + "q90": [ + 58.61338020455933 + ], + "q99": [ + 64.48132194332018 + ] + }, + "action.gripper_velocity": { + "min": [ + -1.0 + ], + "max": [ + 1.0 + ], + "mean": [ + 0.15989621464462586 + ], + "std": [ + 0.4462997670679963 + ], + "count": [ + 18691281 + ], + "q01": [ + -0.9094265350632038 + ], + "q10": [ + -0.10910863026177213 + ], + "q50": [ + 0.09722019469380247 + ], + "q90": [ + 0.5471264388836043 + ], + "q99": [ + 0.9430653164581466 + ] + }, + 
"episode_index": { + "min": [ + 0 + ], + "max": [ + 18101 + ], + "mean": [ + 4006.2060856075004 + ], + "std": [ + 4159.4654856685065 + ], + "count": [ + 18691281 + ], + "q01": [ + 4006.2060856075004 + ], + "q10": [ + 4006.2060856075004 + ], + "q50": [ + 4006.2060856075004 + ], + "q90": [ + 4006.2060856075004 + ], + "q99": [ + 4006.2060856075004 + ] + }, + "observation.image.exterior_image_1_left": { + "min": [ + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ] + ], + "max": [ + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ] + ], + "mean": [ + [ + [ + 0.3935929122066613 + ] + ], + [ + [ + 0.3885335475030728 + ] + ], + [ + [ + 0.3515364706827202 + ] + ] + ], + "std": [ + [ + [ + 0.06012316665802346 + ] + ], + [ + [ + 0.061141097206828955 + ] + ], + [ + [ + 0.07413341557853112 + ] + ] + ], + "count": [ + 6215443 + ], + "q01": [ + [ + [ + 0.012953224073805667 + ] + ], + [ + [ + 0.015152780827787433 + ] + ], + [ + [ + 0.00861166638453724 + ] + ] + ], + "q10": [ + [ + [ + 0.0837700067775019 + ] + ], + [ + [ + 0.08480807433136518 + ] + ], + [ + [ + 0.06764227065612026 + ] + ] + ], + "q50": [ + [ + [ + 0.3558359164636593 + ] + ], + [ + [ + 0.34596424611402693 + ] + ], + [ + [ + 0.3007029392584655 + ] + ] + ], + "q90": [ + [ + [ + 0.7677290626835452 + ] + ], + [ + [ + 0.7620642782848648 + ] + ], + [ + [ + 0.7224838451115285 + ] + ] + ], + "q99": [ + [ + [ + 0.9249874509600575 + ] + ], + [ + [ + 0.9375189969046649 + ] + ], + [ + [ + 0.9227459073014876 + ] + ] + ] + }, + "observation.state.joint_torques_computed": { + "min": [ + -26.771318435668945, + -36.63962936401367, + -26.350133895874023, + -22.278278350830078, + -12.59711742401123, + -9.927318572998047, + -4.382072925567627 + ], + "max": [ + 32.96823501586914, + 35.30547332763672, + 26.816661834716797, + 23.059751510620117, + 12.070505142211914, + 12.110977172851562, + 5.327917575836182 + ], + "mean": [ + -0.08512072409867738, + -0.9065801664123562, + -0.029602630556266114, + 
0.6316930784312421, + 0.028903000698396318, + 0.08252977585512208, + 0.01828691355405112 + ], + "std": [ + 1.340606388276152, + 2.159151088702341, + 1.2868917133427562, + 1.6442633846848522, + 0.8135979162429936, + 0.7703853679967682, + 0.382176611925247 + ], + "count": [ + 18691281 + ], + "q01": [ + -2.7177552872546067, + -4.904378796313158, + -2.5081315091150618, + -2.5720827384983997, + -1.4192874231095336, + -1.3427410438961, + -0.6555645736902543 + ], + "q10": [ + -1.310899968811493, + -2.958179523656676, + -1.1391459525021743, + -0.9565310957849424, + -0.9177959289601971, + -0.7609331206774717, + -0.4078647257442235 + ], + "q50": [ + -0.07505141962627339, + -0.8578361638526507, + -0.036068823539492326, + 0.7534932454207656, + 0.023865668207664566, + 0.11707451898045382, + 0.02569371284470846 + ], + "q90": [ + 1.117567076531323, + 1.0702269684319032, + 1.0826402570669635, + 2.032942384706255, + 0.9798880304673434, + 0.8709803494219956, + 0.4328364981780825 + ], + "q99": [ + 2.523812308049357, + 3.3266416108309964, + 2.4765324795080836, + 3.063502020941035, + 1.4644210715130928, + 1.419444457342673, + 0.6832162237967178 + ] + }, + "action.cartesian_velocity": { + "min": [ + -0.9999999403953552, + -0.9999951124191284, + -0.9999960660934448, + -0.9999980330467224, + -0.9999969601631165, + -0.9999998807907104 + ], + "max": [ + 0.9999998211860657, + 0.9999836087226868, + 0.9999973177909851, + 0.9999697208404541, + 0.9999954104423523, + 0.9999998807907104 + ], + "mean": [ + 0.022781586037321677, + -0.002642982246250131, + 0.019422232008431437, + 0.00385024695714588, + -0.030741170113786385, + -0.007221580743078788 + ], + "std": [ + 0.26331051677940037, + 0.1940336807939164, + 0.2320077278617576, + 0.2277295186145981, + 0.23565237235944003, + 0.30079523233833466 + ], + "count": [ + 18691281 + ], + "q01": [ + -0.5621750592318072, + -0.44066103347818825, + -0.410103508551534, + -0.5074656798841014, + -0.560176623050948, + -0.6746580861109687 + ], + "q10": [ + 
-0.2941247006847527, + -0.2165219671216752, + -0.22198569556668202, + -0.2539858711678539, + -0.3023348467162342, + -0.3539325707069309 + ], + "q50": [ + 0.03167925093387433, + -0.0005856818028497108, + -0.004049419762073512, + 0.0013363499102087182, + -0.024444778635266458, + -0.007232429448076757 + ], + "q90": [ + 0.30944639589367245, + 0.20765986892294552, + 0.3142771570510021, + 0.2638894712537269, + 0.23493938070144402, + 0.33997530700517126 + ], + "q99": [ + 0.5366988428269169, + 0.4283949455663026, + 0.577160869138631, + 0.5241492686975527, + 0.49812821195535767, + 0.6743231345822654 + ] + }, + "action.joint_velocity": { + "min": [ + -1.0, + -1.0, + -1.0, + -1.0, + -1.0, + -1.0, + -1.0 + ], + "max": [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0 + ], + "mean": [ + -0.0032264550712801845, + 0.013372541292925158, + 0.0015260425148475995, + 0.030577660060776682, + 0.000853369159829645, + -0.00030791045007205444, + 0.004324216188539675 + ], + "std": [ + 0.15738005716197367, + 0.29789431140641043, + 0.15267329933136972, + 0.2951536063397572, + 0.21853175444547254, + 0.23759831123238903, + 0.2589575509825406 + ], + "count": [ + 18691281 + ], + "q01": [ + -0.3720953716095578, + -0.6646666966410545, + -0.35537280317155545, + -0.6572392059107256, + -0.5079467301310688, + -0.535227394370355, + -0.6062588965469352 + ], + "q10": [ + -0.16553817347572433, + -0.3784961819192349, + -0.15030858713347564, + -0.3333324914423694, + -0.24241123224105304, + -0.27220791101727304, + -0.2930082969980739 + ], + "q50": [ + -0.0023710144415663072, + 0.031479031390672, + 0.0008277421317488518, + 0.039739343152067036, + 0.0011234450995665334, + -0.00572930901929865, + 0.0057179642953776874 + ], + "q90": [ + 0.1575076632205645, + 0.35038722521032933, + 0.15392283710015325, + 0.35867501730587226, + 0.24204228729362678, + 0.28206719744755376, + 0.2992755165736914 + ], + "q99": [ + 0.3584453109141214, + 0.6165426582109276, + 0.3650701572623211, + 0.596820860066475, + 0.5120463717033935, 
+ 0.5652294579719821, + 0.5868785460202175 + ] + }, + "action.cartesian_position": { + "min": [ + -0.22296355664730072, + -0.8682119250297546, + -0.3079752027988434, + -3.141592502593994, + -1.570475697517395, + -3.1415903568267822 + ], + "max": [ + 0.9279670119285583, + 0.8648782968521118, + 1.074978232383728, + 3.1415927410125732, + 1.5702463388442993, + 3.1415891647338867 + ], + "mean": [ + 0.5217509054840234, + 0.0007883269900035715, + 0.3221104303446512, + 0.30714984136747414, + -0.09803336407639632, + -0.039177217145903054 + ], + "std": [ + 0.12071560629176353, + 0.18503476174786404, + 0.1666573340143361, + 2.9100072713264864, + 0.3633258858590838, + 0.8221708234450337 + ], + "count": [ + 18691281 + ], + "q01": [ + 0.3356816895292364, + -0.19133798527538426, + 0.1801999563371533, + -2.8323461535362444, + -0.5039531064948487, + -0.8696942196827485 + ], + "q10": [ + 0.40801413986337737, + -0.1395284239119436, + 0.2075567910938741, + -2.3230738137233806, + -0.3747761213529591, + -0.6251931005070784 + ], + "q50": [ + 0.5274814798968698, + 0.000745909687968284, + 0.31017238004165865, + 0.47620942748795536, + -0.09728429713123418, + -0.038704636643419714 + ], + "q90": [ + 0.6235519831628223, + 0.14058381794916908, + 0.4583311912346178, + 2.6281921722430055, + 0.1786755079522444, + 0.5456597094824605 + ], + "q99": [ + 0.6542487361445086, + 0.19523494988290416, + 0.5434926045583094, + 2.9609058856482395, + 0.31791269359900076, + 0.7944463373545845 + ] + } +} \ No newline at end of file diff --git a/assets/examples/droid_plus_lerobot_640x360_20260412/success/meta/tasks.parquet b/assets/examples/droid_plus_lerobot_640x360_20260412/success/meta/tasks.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b9db7f91ff31d0e9b9dacae6dba2a8503c9ccf83 Binary files /dev/null and b/assets/examples/droid_plus_lerobot_640x360_20260412/success/meta/tasks.parquet differ diff --git 
a/assets/examples/droid_plus_lerobot_640x360_20260412/success/videos/observation.image.exterior_image_1_left/chunk-000/file-000.mp4 b/assets/examples/droid_plus_lerobot_640x360_20260412/success/videos/observation.image.exterior_image_1_left/chunk-000/file-000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..136bac686e849f34cde3c14291badbaa11e284b2 --- /dev/null +++ b/assets/examples/droid_plus_lerobot_640x360_20260412/success/videos/observation.image.exterior_image_1_left/chunk-000/file-000.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe8cd6fa9e095b557fb4cdf092899ebc28b9653ba8f9b29a745c71e5ae3e48b0 +size 269998 diff --git a/assets/examples/droid_plus_lerobot_640x360_20260412/success/videos/observation.image.exterior_image_2_left/chunk-000/file-000.mp4 b/assets/examples/droid_plus_lerobot_640x360_20260412/success/videos/observation.image.exterior_image_2_left/chunk-000/file-000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..7037933f3ae8f4ab89d738a137c3f8be014d04f2 --- /dev/null +++ b/assets/examples/droid_plus_lerobot_640x360_20260412/success/videos/observation.image.exterior_image_2_left/chunk-000/file-000.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afd7a52d61833996792f27150193a7c6cdefb201f3e2ee83f14415dbbc612a32 +size 200842 diff --git a/assets/examples/droid_plus_lerobot_640x360_20260412/success/videos/observation.image.wrist_image_left/chunk-000/file-000.mp4 b/assets/examples/droid_plus_lerobot_640x360_20260412/success/videos/observation.image.wrist_image_left/chunk-000/file-000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..48f4763bb3d9e0850dbcede957af037e4825ccca --- /dev/null +++ b/assets/examples/droid_plus_lerobot_640x360_20260412/success/videos/observation.image.wrist_image_left/chunk-000/file-000.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:15ac775bcec935c07955c2aaa41c3834c3c846e2b1bc91bf3e0b9b7282733930 +size 316099 diff --git a/assets/examples/fastumi/fastumi_single_arm/pour_coke/data/chunk-000/file-000.parquet b/assets/examples/fastumi/fastumi_single_arm/pour_coke/data/chunk-000/file-000.parquet new file mode 100644 index 0000000000000000000000000000000000000000..6a61fa5cc6aea815ef7564f8d054b74ba46e499b Binary files /dev/null and b/assets/examples/fastumi/fastumi_single_arm/pour_coke/data/chunk-000/file-000.parquet differ diff --git a/assets/examples/fastumi/fastumi_single_arm/pour_coke/manifest.json b/assets/examples/fastumi/fastumi_single_arm/pour_coke/manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..eccff5fcc18e4030b13da7ce6a10e93476df8bd7 --- /dev/null +++ b/assets/examples/fastumi/fastumi_single_arm/pour_coke/manifest.json @@ -0,0 +1,8 @@ +{ + "source_root": "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/fastumi/fastumi_single_arm/pour_coke", + "source_episode": 0, + "local_episode": 0, + "frames": 180, + "task": "pour_coke", + "video": "videos/observation.image.right_main_camera_rgb/chunk-000/file-000.mp4" +} diff --git a/assets/examples/fastumi/fastumi_single_arm/pour_coke/meta/episodes/chunk-000/file-000.parquet b/assets/examples/fastumi/fastumi_single_arm/pour_coke/meta/episodes/chunk-000/file-000.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b5a9030ca61cbd0e6a212fc0e03e42bb11207fca Binary files /dev/null and b/assets/examples/fastumi/fastumi_single_arm/pour_coke/meta/episodes/chunk-000/file-000.parquet differ diff --git a/assets/examples/fastumi/fastumi_single_arm/pour_coke/meta/info.json b/assets/examples/fastumi/fastumi_single_arm/pour_coke/meta/info.json new file mode 100644 index 0000000000000000000000000000000000000000..70c21f1ea0a1f7c080a5503dcac80e83c7a890b3 --- /dev/null +++ b/assets/examples/fastumi/fastumi_single_arm/pour_coke/meta/info.json @@ -0,0 +1,113 @@ +{ + 
"codebase_version": "v3.0", + "robot_type": "XARM6", + "robot_setup_type": "single_arm", + "include_head_camera": false, + "include_body_camera": false, + "third_camera_num": 0, + "has_mirrors": false, + "total_episodes": 1, + "total_frames": 180, + "total_tasks": 1, + "data_files_size_in_mb": 100.0, + "video_files_size_in_mb": 200.0, + "fps": 20.0, + "splits": { + "train": "0:1" + }, + "data_path": "data/chunk-{chunk_index:03d}/file-{file_index:03d}.parquet", + "video_path": "videos/{video_key}/chunk-{chunk_index:03d}/file-{file_index:03d}.mp4", + "features": { + "timestamp": { + "dtype": "float32", + "shape": [ + 1 + ], + "names": null + }, + "frame_index": { + "dtype": "int64", + "shape": [ + 1 + ], + "names": null + }, + "episode_index": { + "dtype": "int64", + "shape": [ + 1 + ], + "names": null + }, + "index": { + "dtype": "int64", + "shape": [ + 1 + ], + "names": null + }, + "task_index": { + "dtype": "int64", + "shape": [ + 1 + ], + "names": null + }, + "observation.state.right_main_camera_trajectory_xyz_wxyz": { + "dtype": "float32", + "shape": [ + 7 + ], + "names": [ + "observation.state.right_main_camera_trajectory_xyz_wxyz_0", + "observation.state.right_main_camera_trajectory_xyz_wxyz_1", + "observation.state.right_main_camera_trajectory_xyz_wxyz_2", + "observation.state.right_main_camera_trajectory_xyz_wxyz_3", + "observation.state.right_main_camera_trajectory_xyz_wxyz_4", + "observation.state.right_main_camera_trajectory_xyz_wxyz_5", + "observation.state.right_main_camera_trajectory_xyz_wxyz_6" + ] + }, + "observation.state.right_gripper_width_m": { + "dtype": "float32", + "shape": [ + 1 + ], + "names": [ + "observation.state.right_gripper_width_m_0" + ] + }, + "observation.image.right_main_camera_rgb": { + "dtype": "video", + "shape": [ + 480, + 480, + 3 + ], + "names": [ + "height", + "width", + "channels" + ] + } + }, + "right_eef_pose_in_main_xyz_wxyz": [ + 0.0, + 0.08579999953508377, + 0.1454000025987625, + 1.0, + 0.0, + 0.0, + 0.0 + ], + 
"right_main_camera_raw_res": [ + 1080, + 1920 + ], + "right_main_camera_video_pipeline": "ffmpeg -f rawvideo -pix_fmt bgr24 -s {in_w}x{in_h} -r {fps} -i - -loop 1 -i {xpgm} -loop 1 -i {ypgm} -filter_complex '[0:v]format=yuv444p[v];[v][1:v][2:v]remap=format=color,crop={side}:{side},scale={out_w}:{out_h},format=yuv420p[out]' -map [out] -shortest -an -c:v av1_nvenc -preset p4 -cq 23 {output}", + "right_gripper_model": "FastUMI", + "right_gripper_width_m_max": [ + 0.08 + ], + "repo_id": "local/camx/fastumi/fastumi_single_arm/pour_coke" +} diff --git a/assets/examples/fastumi/fastumi_single_arm/pour_coke/meta/stats.json b/assets/examples/fastumi/fastumi_single_arm/pour_coke/meta/stats.json new file mode 100644 index 0000000000000000000000000000000000000000..2aaa8942cb39e8d4fb7171107f6d7cdbdbe85d79 --- /dev/null +++ b/assets/examples/fastumi/fastumi_single_arm/pour_coke/meta/stats.json @@ -0,0 +1,54 @@ +{ + "observation.state.right_gripper_width_m": { + "mean": [ + 0.062415022403001785 + ], + "std": [ + 0.014273382723331451 + ], + "min": [ + 0.04197647050023079 + ], + "max": [ + 0.07999999821186066 + ] + }, + "observation.state.right_main_camera_trajectory_xyz_wxyz": { + "mean": [ + 0.29477229714393616, + 0.17546765506267548, + -0.1128092110157013, + 0.14784009754657745, + 0.01722816191613674, + 0.144368514418602, + -0.025626657530665398 + ], + "std": [ + 0.16135473549365997, + 0.15001733601093292, + 0.1014970913529396, + 0.31992843747138977, + 0.6212483048439026, + 0.5223610401153564, + 0.4417937994003296 + ], + "min": [ + -0.19988380372524261, + -0.19237850606441498, + -0.40376341342926025, + -0.5746796727180481, + -0.6518781185150146, + -0.6542174816131592, + -0.5680083632469177 + ], + "max": [ + 0.5659647583961487, + 0.5889244675636292, + 0.11997988075017929, + 0.5790585875511169, + 0.8479742407798767, + 0.7905207872390747, + 0.6766778230667114 + ] + } +} \ No newline at end of file diff --git a/assets/examples/fastumi/fastumi_single_arm/pour_coke/meta/tasks.jsonl 
b/assets/examples/fastumi/fastumi_single_arm/pour_coke/meta/tasks.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..95e16b3970abbc53500c9664e6dc770988814217 --- /dev/null +++ b/assets/examples/fastumi/fastumi_single_arm/pour_coke/meta/tasks.jsonl @@ -0,0 +1,460 @@ +{"task_index": 0, "task": "pour_coke", "raw_video_dirs": ["/storage/hdd1/chiling/fastumi_data/raw/pour_coke_v0", "/storage/hdd1/chiling/fastumi_data/raw/pour_coke_v1", "/storage/hdd1/chiling/fastumi_data/raw/pour_coke_v2", "/storage/hdd1/chiling/fastumi_data/raw/pour_coke_v3", "/storage/hdd1/chiling/fastumi_data/raw/pour_coke_v4", "/storage/hdd1/chiling/fastumi_data/raw/pour_coke_v5", "/storage/hdd1/chiling/fastumi_data/raw/pour_coke_v6", "/storage/hdd1/chiling/fastumi_data/raw/pour_coke_v7", "/storage/hdd1/chiling/fastumi_data/raw/pour_coke_v8", "/storage/hdd1/chiling/fastumi_data/raw/pour_coke_v9"]} +{"episode_idx": 0, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_1.hdf5"} +{"episode_idx": 1, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_2.hdf5"} +{"episode_idx": 2, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_3.hdf5"} +{"episode_idx": 3, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_4.hdf5"} +{"episode_idx": 4, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_5.hdf5"} +{"episode_idx": 5, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_6.hdf5"} +{"episode_idx": 6, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_7.hdf5"} +{"episode_idx": 7, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_8.hdf5"} +{"episode_idx": 8, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_9.hdf5"} +{"episode_idx": 9, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_10.hdf5"} +{"episode_idx": 10, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_11.hdf5"} +{"episode_idx": 11, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_12.hdf5"} +{"episode_idx": 12, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_13.hdf5"} 
+{"episode_idx": 13, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_14.hdf5"} +{"episode_idx": 14, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_15.hdf5"} +{"episode_idx": 15, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_16.hdf5"} +{"episode_idx": 16, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_17.hdf5"} +{"episode_idx": 17, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_18.hdf5"} +{"episode_idx": 18, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_19.hdf5"} +{"episode_idx": 19, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_20.hdf5"} +{"episode_idx": 20, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_21.hdf5"} +{"episode_idx": 21, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_22.hdf5"} +{"episode_idx": 22, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_23.hdf5"} +{"episode_idx": 23, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_24.hdf5"} +{"episode_idx": 24, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_25.hdf5"} +{"episode_idx": 25, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_26.hdf5"} +{"episode_idx": 26, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_27.hdf5"} +{"episode_idx": 27, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_28.hdf5"} +{"episode_idx": 28, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_29.hdf5"} +{"episode_idx": 29, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_30.hdf5"} +{"episode_idx": 30, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_31.hdf5"} +{"episode_idx": 31, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_32.hdf5"} +{"episode_idx": 32, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_33.hdf5"} +{"episode_idx": 33, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_34.hdf5"} +{"episode_idx": 34, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_35.hdf5"} +{"episode_idx": 35, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_36.hdf5"} +{"episode_idx": 36, 
"task": "pour_coke", "raw_path": "pour_coke_v0/episode_37.hdf5"} +{"episode_idx": 37, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_38.hdf5"} +{"episode_idx": 38, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_39.hdf5"} +{"episode_idx": 39, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_40.hdf5"} +{"episode_idx": 40, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_41.hdf5"} +{"episode_idx": 41, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_42.hdf5"} +{"episode_idx": 42, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_43.hdf5"} +{"episode_idx": 43, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_44.hdf5"} +{"episode_idx": 44, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_45.hdf5"} +{"episode_idx": 45, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_46.hdf5"} +{"episode_idx": 46, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_47.hdf5"} +{"episode_idx": 47, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_48.hdf5"} +{"episode_idx": 48, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_49.hdf5"} +{"episode_idx": 49, "task": "pour_coke", "raw_path": "pour_coke_v0/episode_50.hdf5"} +{"episode_idx": 50, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_1.hdf5"} +{"episode_idx": 51, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_2.hdf5"} +{"episode_idx": 52, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_3.hdf5"} +{"episode_idx": 53, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_4.hdf5"} +{"episode_idx": 54, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_5.hdf5"} +{"episode_idx": 55, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_6.hdf5"} +{"episode_idx": 56, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_7.hdf5"} +{"episode_idx": 57, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_8.hdf5"} +{"episode_idx": 58, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_9.hdf5"} +{"episode_idx": 59, "task": "pour_coke", 
"raw_path": "pour_coke_v1/episode_10.hdf5"} +{"episode_idx": 60, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_11.hdf5"} +{"episode_idx": 61, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_12.hdf5"} +{"episode_idx": 62, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_13.hdf5"} +{"episode_idx": 63, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_14.hdf5"} +{"episode_idx": 64, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_15.hdf5"} +{"episode_idx": 65, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_16.hdf5"} +{"episode_idx": 66, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_17.hdf5"} +{"episode_idx": 67, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_18.hdf5"} +{"episode_idx": 68, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_19.hdf5"} +{"episode_idx": 69, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_20.hdf5"} +{"episode_idx": 70, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_21.hdf5"} +{"episode_idx": 71, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_22.hdf5"} +{"episode_idx": 72, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_23.hdf5"} +{"episode_idx": 73, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_24.hdf5"} +{"episode_idx": 74, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_25.hdf5"} +{"episode_idx": 75, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_26.hdf5"} +{"episode_idx": 76, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_27.hdf5"} +{"episode_idx": 77, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_28.hdf5"} +{"episode_idx": 78, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_29.hdf5"} +{"episode_idx": 79, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_30.hdf5"} +{"episode_idx": 80, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_31.hdf5"} +{"episode_idx": 81, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_32.hdf5"} +{"episode_idx": 82, "task": "pour_coke", "raw_path": 
"pour_coke_v1/episode_33.hdf5"} +{"episode_idx": 83, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_34.hdf5"} +{"episode_idx": 84, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_35.hdf5"} +{"episode_idx": 85, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_36.hdf5"} +{"episode_idx": 86, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_37.hdf5"} +{"episode_idx": 87, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_38.hdf5"} +{"episode_idx": 88, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_39.hdf5"} +{"episode_idx": 89, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_40.hdf5"} +{"episode_idx": 90, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_41.hdf5"} +{"episode_idx": 91, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_42.hdf5"} +{"episode_idx": 92, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_43.hdf5"} +{"episode_idx": 93, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_44.hdf5"} +{"episode_idx": 94, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_45.hdf5"} +{"episode_idx": 95, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_46.hdf5"} +{"episode_idx": 96, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_47.hdf5"} +{"episode_idx": 97, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_48.hdf5"} +{"episode_idx": 98, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_49.hdf5"} +{"episode_idx": 99, "task": "pour_coke", "raw_path": "pour_coke_v1/episode_50.hdf5"} +{"episode_idx": 100, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_1.hdf5"} +{"episode_idx": 101, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_2.hdf5"} +{"episode_idx": 102, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_3.hdf5"} +{"episode_idx": 103, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_4.hdf5"} +{"episode_idx": 104, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_5.hdf5"} +{"episode_idx": 105, "task": "pour_coke", "raw_path": 
"pour_coke_v2/episode_6.hdf5"} +{"episode_idx": 106, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_7.hdf5"} +{"episode_idx": 107, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_8.hdf5"} +{"episode_idx": 108, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_9.hdf5"} +{"episode_idx": 109, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_10.hdf5"} +{"episode_idx": 110, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_11.hdf5"} +{"episode_idx": 111, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_12.hdf5"} +{"episode_idx": 112, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_13.hdf5"} +{"episode_idx": 113, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_14.hdf5"} +{"episode_idx": 114, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_15.hdf5"} +{"episode_idx": 115, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_16.hdf5"} +{"episode_idx": 116, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_17.hdf5"} +{"episode_idx": 117, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_18.hdf5"} +{"episode_idx": 118, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_19.hdf5"} +{"episode_idx": 119, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_20.hdf5"} +{"episode_idx": 120, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_21.hdf5"} +{"episode_idx": 121, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_22.hdf5"} +{"episode_idx": 122, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_23.hdf5"} +{"episode_idx": 123, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_24.hdf5"} +{"episode_idx": 124, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_25.hdf5"} +{"episode_idx": 125, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_26.hdf5"} +{"episode_idx": 126, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_27.hdf5"} +{"episode_idx": 127, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_28.hdf5"} +{"episode_idx": 128, "task": "pour_coke", "raw_path": 
"pour_coke_v2/episode_29.hdf5"} +{"episode_idx": 129, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_30.hdf5"} +{"episode_idx": 130, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_31.hdf5"} +{"episode_idx": 131, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_32.hdf5"} +{"episode_idx": 132, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_33.hdf5"} +{"episode_idx": 133, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_34.hdf5"} +{"episode_idx": 134, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_35.hdf5"} +{"episode_idx": 135, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_36.hdf5"} +{"episode_idx": 136, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_37.hdf5"} +{"episode_idx": 137, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_38.hdf5"} +{"episode_idx": 138, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_39.hdf5"} +{"episode_idx": 139, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_40.hdf5"} +{"episode_idx": 140, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_41.hdf5"} +{"episode_idx": 141, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_42.hdf5"} +{"episode_idx": 142, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_43.hdf5"} +{"episode_idx": 143, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_44.hdf5"} +{"episode_idx": 144, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_45.hdf5"} +{"episode_idx": 145, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_46.hdf5"} +{"episode_idx": 146, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_47.hdf5"} +{"episode_idx": 147, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_48.hdf5"} +{"episode_idx": 148, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_49.hdf5"} +{"episode_idx": 149, "task": "pour_coke", "raw_path": "pour_coke_v2/episode_50.hdf5"} +{"episode_idx": 150, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_1.hdf5"} +{"episode_idx": 151, "task": "pour_coke", "raw_path": 
"pour_coke_v3/episode_2.hdf5"} +{"episode_idx": 152, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_3.hdf5"} +{"episode_idx": 153, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_4.hdf5"} +{"episode_idx": 154, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_5.hdf5"} +{"episode_idx": 155, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_6.hdf5"} +{"episode_idx": 156, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_7.hdf5"} +{"episode_idx": 157, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_8.hdf5"} +{"episode_idx": 158, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_9.hdf5"} +{"episode_idx": 159, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_10.hdf5"} +{"episode_idx": 160, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_11.hdf5"} +{"episode_idx": 161, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_12.hdf5"} +{"episode_idx": 162, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_13.hdf5"} +{"episode_idx": 163, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_14.hdf5"} +{"episode_idx": 164, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_15.hdf5"} +{"episode_idx": 165, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_16.hdf5"} +{"episode_idx": 166, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_17.hdf5"} +{"episode_idx": 167, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_18.hdf5"} +{"episode_idx": 168, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_19.hdf5"} +{"episode_idx": 169, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_20.hdf5"} +{"episode_idx": 170, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_21.hdf5"} +{"episode_idx": 171, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_22.hdf5"} +{"episode_idx": 172, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_23.hdf5"} +{"episode_idx": 173, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_24.hdf5"} +{"episode_idx": 174, "task": "pour_coke", "raw_path": 
"pour_coke_v3/episode_25.hdf5"} +{"episode_idx": 175, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_26.hdf5"} +{"episode_idx": 176, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_27.hdf5"} +{"episode_idx": 177, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_28.hdf5"} +{"episode_idx": 178, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_29.hdf5"} +{"episode_idx": 179, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_30.hdf5"} +{"episode_idx": 180, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_31.hdf5"} +{"episode_idx": 181, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_32.hdf5"} +{"episode_idx": 182, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_33.hdf5"} +{"episode_idx": 183, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_34.hdf5"} +{"episode_idx": 184, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_35.hdf5"} +{"episode_idx": 185, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_36.hdf5"} +{"episode_idx": 186, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_37.hdf5"} +{"episode_idx": 187, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_38.hdf5"} +{"episode_idx": 188, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_39.hdf5"} +{"episode_idx": 189, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_40.hdf5"} +{"episode_idx": 190, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_41.hdf5"} +{"episode_idx": 191, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_42.hdf5"} +{"episode_idx": 192, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_43.hdf5"} +{"episode_idx": 193, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_44.hdf5"} +{"episode_idx": 194, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_45.hdf5"} +{"episode_idx": 195, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_46.hdf5"} +{"episode_idx": 196, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_47.hdf5"} +{"episode_idx": 197, "task": "pour_coke", 
"raw_path": "pour_coke_v3/episode_48.hdf5"} +{"episode_idx": 198, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_49.hdf5"} +{"episode_idx": 199, "task": "pour_coke", "raw_path": "pour_coke_v3/episode_50.hdf5"} +{"episode_idx": 200, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_1.hdf5"} +{"episode_idx": 201, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_2.hdf5"} +{"episode_idx": 202, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_3.hdf5"} +{"episode_idx": 203, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_4.hdf5"} +{"episode_idx": 204, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_5.hdf5"} +{"episode_idx": 205, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_6.hdf5"} +{"episode_idx": 206, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_7.hdf5"} +{"episode_idx": 207, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_8.hdf5"} +{"episode_idx": 208, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_9.hdf5"} +{"episode_idx": 209, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_10.hdf5"} +{"episode_idx": 210, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_11.hdf5"} +{"episode_idx": 211, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_12.hdf5"} +{"episode_idx": 212, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_13.hdf5"} +{"episode_idx": 213, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_14.hdf5"} +{"episode_idx": 214, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_15.hdf5"} +{"episode_idx": 215, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_16.hdf5"} +{"episode_idx": 216, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_17.hdf5"} +{"episode_idx": 217, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_18.hdf5"} +{"episode_idx": 218, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_19.hdf5"} +{"episode_idx": 219, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_20.hdf5"} +{"episode_idx": 220, "task": "pour_coke", 
"raw_path": "pour_coke_v4/episode_21.hdf5"} +{"episode_idx": 221, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_22.hdf5"} +{"episode_idx": 222, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_23.hdf5"} +{"episode_idx": 223, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_24.hdf5"} +{"episode_idx": 224, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_25.hdf5"} +{"episode_idx": 225, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_26.hdf5"} +{"episode_idx": 226, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_27.hdf5"} +{"episode_idx": 227, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_28.hdf5"} +{"episode_idx": 228, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_29.hdf5"} +{"episode_idx": 229, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_30.hdf5"} +{"episode_idx": 230, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_31.hdf5"} +{"episode_idx": 231, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_32.hdf5"} +{"episode_idx": 232, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_33.hdf5"} +{"episode_idx": 233, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_34.hdf5"} +{"episode_idx": 234, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_35.hdf5"} +{"episode_idx": 235, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_36.hdf5"} +{"episode_idx": 236, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_37.hdf5"} +{"episode_idx": 237, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_38.hdf5"} +{"episode_idx": 238, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_39.hdf5"} +{"episode_idx": 239, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_40.hdf5"} +{"episode_idx": 240, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_41.hdf5"} +{"episode_idx": 241, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_42.hdf5"} +{"episode_idx": 242, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_43.hdf5"} +{"episode_idx": 243, "task": 
"pour_coke", "raw_path": "pour_coke_v4/episode_44.hdf5"} +{"episode_idx": 244, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_45.hdf5"} +{"episode_idx": 245, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_46.hdf5"} +{"episode_idx": 246, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_47.hdf5"} +{"episode_idx": 247, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_48.hdf5"} +{"episode_idx": 248, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_49.hdf5"} +{"episode_idx": 249, "task": "pour_coke", "raw_path": "pour_coke_v4/episode_50.hdf5"} +{"episode_idx": 250, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_1.hdf5"} +{"episode_idx": 251, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_2.hdf5"} +{"episode_idx": 252, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_3.hdf5"} +{"episode_idx": 253, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_4.hdf5"} +{"episode_idx": 254, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_5.hdf5"} +{"episode_idx": 255, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_6.hdf5"} +{"episode_idx": 256, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_7.hdf5"} +{"episode_idx": 257, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_8.hdf5"} +{"episode_idx": 258, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_9.hdf5"} +{"episode_idx": 259, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_10.hdf5"} +{"episode_idx": 260, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_11.hdf5"} +{"episode_idx": 261, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_12.hdf5"} +{"episode_idx": 262, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_13.hdf5"} +{"episode_idx": 263, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_14.hdf5"} +{"episode_idx": 264, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_15.hdf5"} +{"episode_idx": 265, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_16.hdf5"} +{"episode_idx": 266, "task": 
"pour_coke", "raw_path": "pour_coke_v5/episode_17.hdf5"} +{"episode_idx": 267, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_18.hdf5"} +{"episode_idx": 268, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_19.hdf5"} +{"episode_idx": 269, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_20.hdf5"} +{"episode_idx": 270, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_21.hdf5"} +{"episode_idx": 271, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_22.hdf5"} +{"episode_idx": 272, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_23.hdf5"} +{"episode_idx": 273, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_24.hdf5"} +{"episode_idx": 274, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_25.hdf5"} +{"episode_idx": 275, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_26.hdf5"} +{"episode_idx": 276, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_27.hdf5"} +{"episode_idx": 277, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_28.hdf5"} +{"episode_idx": 278, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_29.hdf5"} +{"episode_idx": 279, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_30.hdf5"} +{"episode_idx": 280, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_31.hdf5"} +{"episode_idx": 281, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_32.hdf5"} +{"episode_idx": 282, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_33.hdf5"} +{"episode_idx": 283, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_34.hdf5"} +{"episode_idx": 284, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_35.hdf5"} +{"episode_idx": 285, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_36.hdf5"} +{"episode_idx": 286, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_37.hdf5"} +{"episode_idx": 287, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_38.hdf5"} +{"episode_idx": 288, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_39.hdf5"} +{"episode_idx": 289, 
"task": "pour_coke", "raw_path": "pour_coke_v5/episode_40.hdf5"} +{"episode_idx": 290, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_41.hdf5"} +{"episode_idx": 291, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_42.hdf5"} +{"episode_idx": 292, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_43.hdf5"} +{"episode_idx": 293, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_44.hdf5"} +{"episode_idx": 294, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_45.hdf5"} +{"episode_idx": 295, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_46.hdf5"} +{"episode_idx": 296, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_47.hdf5"} +{"episode_idx": 297, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_48.hdf5"} +{"episode_idx": 298, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_49.hdf5"} +{"episode_idx": 299, "task": "pour_coke", "raw_path": "pour_coke_v5/episode_50.hdf5"} +{"episode_idx": 300, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_1.hdf5"} +{"episode_idx": 301, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_2.hdf5"} +{"episode_idx": 302, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_3.hdf5"} +{"episode_idx": 303, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_4.hdf5"} +{"episode_idx": 304, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_5.hdf5"} +{"episode_idx": 305, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_6.hdf5"} +{"episode_idx": 306, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_7.hdf5"} +{"episode_idx": 307, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_8.hdf5"} +{"episode_idx": 308, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_9.hdf5"} +{"episode_idx": 309, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_10.hdf5"} +{"episode_idx": 310, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_11.hdf5"} +{"episode_idx": 311, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_12.hdf5"} +{"episode_idx": 312, "task": 
"pour_coke", "raw_path": "pour_coke_v6/episode_13.hdf5"} +{"episode_idx": 313, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_14.hdf5"} +{"episode_idx": 314, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_15.hdf5"} +{"episode_idx": 315, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_16.hdf5"} +{"episode_idx": 316, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_17.hdf5"} +{"episode_idx": 317, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_18.hdf5"} +{"episode_idx": 318, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_19.hdf5"} +{"episode_idx": 319, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_20.hdf5"} +{"episode_idx": 320, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_21.hdf5"} +{"episode_idx": 321, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_22.hdf5"} +{"episode_idx": 322, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_23.hdf5"} +{"episode_idx": 323, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_24.hdf5"} +{"episode_idx": 324, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_25.hdf5"} +{"episode_idx": 325, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_26.hdf5"} +{"episode_idx": 326, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_27.hdf5"} +{"episode_idx": 327, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_28.hdf5"} +{"episode_idx": 328, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_29.hdf5"} +{"episode_idx": 329, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_30.hdf5"} +{"episode_idx": 330, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_31.hdf5"} +{"episode_idx": 331, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_32.hdf5"} +{"episode_idx": 332, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_33.hdf5"} +{"episode_idx": 333, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_34.hdf5"} +{"episode_idx": 334, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_35.hdf5"} +{"episode_idx": 335, 
"task": "pour_coke", "raw_path": "pour_coke_v6/episode_36.hdf5"} +{"episode_idx": 336, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_37.hdf5"} +{"episode_idx": 337, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_38.hdf5"} +{"episode_idx": 338, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_39.hdf5"} +{"episode_idx": 339, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_40.hdf5"} +{"episode_idx": 340, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_41.hdf5"} +{"episode_idx": 341, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_42.hdf5"} +{"episode_idx": 342, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_43.hdf5"} +{"episode_idx": 343, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_44.hdf5"} +{"episode_idx": 344, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_45.hdf5"} +{"episode_idx": 345, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_46.hdf5"} +{"episode_idx": 346, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_47.hdf5"} +{"episode_idx": 347, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_48.hdf5"} +{"episode_idx": 348, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_49.hdf5"} +{"episode_idx": 349, "task": "pour_coke", "raw_path": "pour_coke_v6/episode_50.hdf5"} +{"episode_idx": 350, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_1.hdf5"} +{"episode_idx": 351, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_2.hdf5"} +{"episode_idx": 352, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_3.hdf5"} +{"episode_idx": 353, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_4.hdf5"} +{"episode_idx": 354, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_5.hdf5"} +{"episode_idx": 355, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_6.hdf5"} +{"episode_idx": 356, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_7.hdf5"} +{"episode_idx": 357, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_8.hdf5"} +{"episode_idx": 358, 
"task": "pour_coke", "raw_path": "pour_coke_v7/episode_9.hdf5"} +{"episode_idx": 359, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_10.hdf5"} +{"episode_idx": 360, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_11.hdf5"} +{"episode_idx": 361, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_12.hdf5"} +{"episode_idx": 362, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_13.hdf5"} +{"episode_idx": 363, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_14.hdf5"} +{"episode_idx": 364, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_15.hdf5"} +{"episode_idx": 365, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_16.hdf5"} +{"episode_idx": 366, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_17.hdf5"} +{"episode_idx": 367, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_18.hdf5"} +{"episode_idx": 368, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_19.hdf5"} +{"episode_idx": 369, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_20.hdf5"} +{"episode_idx": 370, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_21.hdf5"} +{"episode_idx": 371, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_22.hdf5"} +{"episode_idx": 372, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_23.hdf5"} +{"episode_idx": 373, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_24.hdf5"} +{"episode_idx": 374, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_25.hdf5"} +{"episode_idx": 375, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_26.hdf5"} +{"episode_idx": 376, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_27.hdf5"} +{"episode_idx": 377, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_28.hdf5"} +{"episode_idx": 378, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_29.hdf5"} +{"episode_idx": 379, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_30.hdf5"} +{"episode_idx": 380, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_31.hdf5"} +{"episode_idx": 381, 
"task": "pour_coke", "raw_path": "pour_coke_v7/episode_32.hdf5"} +{"episode_idx": 382, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_33.hdf5"} +{"episode_idx": 383, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_34.hdf5"} +{"episode_idx": 384, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_35.hdf5"} +{"episode_idx": 385, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_36.hdf5"} +{"episode_idx": 386, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_37.hdf5"} +{"episode_idx": 387, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_38.hdf5"} +{"episode_idx": 388, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_39.hdf5"} +{"episode_idx": 389, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_40.hdf5"} +{"episode_idx": 390, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_41.hdf5"} +{"episode_idx": 391, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_42.hdf5"} +{"episode_idx": 392, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_43.hdf5"} +{"episode_idx": 393, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_44.hdf5"} +{"episode_idx": 394, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_45.hdf5"} +{"episode_idx": 395, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_46.hdf5"} +{"episode_idx": 396, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_47.hdf5"} +{"episode_idx": 397, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_48.hdf5"} +{"episode_idx": 398, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_49.hdf5"} +{"episode_idx": 399, "task": "pour_coke", "raw_path": "pour_coke_v7/episode_50.hdf5"} +{"episode_idx": 400, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_1.hdf5"} +{"episode_idx": 401, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_2.hdf5"} +{"episode_idx": 402, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_3.hdf5"} +{"episode_idx": 403, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_4.hdf5"} +{"episode_idx": 404, 
"task": "pour_coke", "raw_path": "pour_coke_v8/episode_5.hdf5"} +{"episode_idx": 405, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_6.hdf5"} +{"episode_idx": 406, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_7.hdf5"} +{"episode_idx": 407, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_8.hdf5"} +{"episode_idx": 408, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_9.hdf5"} +{"episode_idx": 409, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_10.hdf5"} +{"episode_idx": 410, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_11.hdf5"} +{"episode_idx": 411, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_12.hdf5"} +{"episode_idx": 412, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_13.hdf5"} +{"episode_idx": 413, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_14.hdf5"} +{"episode_idx": 414, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_15.hdf5"} +{"episode_idx": 415, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_16.hdf5"} +{"episode_idx": 416, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_17.hdf5"} +{"episode_idx": 417, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_18.hdf5"} +{"episode_idx": 418, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_19.hdf5"} +{"episode_idx": 419, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_20.hdf5"} +{"episode_idx": 420, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_21.hdf5"} +{"episode_idx": 421, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_22.hdf5"} +{"episode_idx": 422, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_23.hdf5"} +{"episode_idx": 423, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_24.hdf5"} +{"episode_idx": 424, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_25.hdf5"} +{"episode_idx": 425, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_26.hdf5"} +{"episode_idx": 426, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_27.hdf5"} +{"episode_idx": 427, 
"task": "pour_coke", "raw_path": "pour_coke_v8/episode_28.hdf5"} +{"episode_idx": 428, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_29.hdf5"} +{"episode_idx": 429, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_30.hdf5"} +{"episode_idx": 430, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_31.hdf5"} +{"episode_idx": 431, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_32.hdf5"} +{"episode_idx": 432, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_33.hdf5"} +{"episode_idx": 433, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_34.hdf5"} +{"episode_idx": 434, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_35.hdf5"} +{"episode_idx": 435, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_36.hdf5"} +{"episode_idx": 436, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_37.hdf5"} +{"episode_idx": 437, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_38.hdf5"} +{"episode_idx": 438, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_39.hdf5"} +{"episode_idx": 439, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_40.hdf5"} +{"episode_idx": 440, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_41.hdf5"} +{"episode_idx": 441, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_42.hdf5"} +{"episode_idx": 442, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_43.hdf5"} +{"episode_idx": 443, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_44.hdf5"} +{"episode_idx": 444, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_45.hdf5"} +{"episode_idx": 445, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_46.hdf5"} +{"episode_idx": 446, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_47.hdf5"} +{"episode_idx": 447, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_48.hdf5"} +{"episode_idx": 448, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_49.hdf5"} +{"episode_idx": 449, "task": "pour_coke", "raw_path": "pour_coke_v8/episode_50.hdf5"} +{"episode_idx": 
450, "task": "pour_coke", "raw_path": "pour_coke_v9/episode_1.hdf5"} +{"episode_idx": 451, "task": "pour_coke", "raw_path": "pour_coke_v9/episode_2.hdf5"} +{"episode_idx": 452, "task": "pour_coke", "raw_path": "pour_coke_v9/episode_3.hdf5"} +{"episode_idx": 453, "task": "pour_coke", "raw_path": "pour_coke_v9/episode_4.hdf5"} +{"episode_idx": 454, "task": "pour_coke", "raw_path": "pour_coke_v9/episode_5.hdf5"} +{"episode_idx": 455, "task": "pour_coke", "raw_path": "pour_coke_v9/episode_6.hdf5"} +{"episode_idx": 456, "task": "pour_coke", "raw_path": "pour_coke_v9/episode_7.hdf5"} +{"episode_idx": 457, "task": "pour_coke", "raw_path": "pour_coke_v9/episode_8.hdf5"} +{"episode_idx": 458, "task": "pour_coke", "raw_path": "pour_coke_v9/episode_9.hdf5"} diff --git a/assets/examples/fastumi/fastumi_single_arm/pour_coke/meta/tasks.parquet b/assets/examples/fastumi/fastumi_single_arm/pour_coke/meta/tasks.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f0f43b451192a95d8863070bd6f041264e649cbf Binary files /dev/null and b/assets/examples/fastumi/fastumi_single_arm/pour_coke/meta/tasks.parquet differ diff --git a/assets/examples/fastumi/fastumi_single_arm/pour_coke/videos/observation.image.right_main_camera_rgb/chunk-000/file-000.mp4 b/assets/examples/fastumi/fastumi_single_arm/pour_coke/videos/observation.image.right_main_camera_rgb/chunk-000/file-000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..6cac1f60a305209778d67c3e6dc8daf6644aa2e9 --- /dev/null +++ b/assets/examples/fastumi/fastumi_single_arm/pour_coke/videos/observation.image.right_main_camera_rgb/chunk-000/file-000.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81b3aa9d26f522ef357b575b5fa5c3755a2c1d3d0823004fae166c12268adff1 +size 410211 diff --git a/assets/examples/fractal20220817_data/data/chunk-000/file-000.parquet b/assets/examples/fractal20220817_data/data/chunk-000/file-000.parquet new file mode 100644 index 
0000000000000000000000000000000000000000..7b751d27de881df062960dde05c52c0c6679cede Binary files /dev/null and b/assets/examples/fractal20220817_data/data/chunk-000/file-000.parquet differ diff --git a/assets/examples/fractal20220817_data/manifest.json b/assets/examples/fractal20220817_data/manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..46b609a09289d3b1a8d19de295ac5fd273fd5552 --- /dev/null +++ b/assets/examples/fractal20220817_data/manifest.json @@ -0,0 +1,19 @@ +{ + "source_root": "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/fractal20220817_data", + "source_episode": 2, + "local_episode": 0, + "source_index_range": [ + 181, + 206 + ], + "frames": 25, + "source_task_indices": [ + 2 + ], + "tasks": [ + "pick apple from white bowl" + ], + "videos": [ + "videos/observation.images.image/chunk-000/file-000.mp4" + ] +} diff --git a/assets/examples/fractal20220817_data/meta/episodes/chunk-000/file-000.parquet b/assets/examples/fractal20220817_data/meta/episodes/chunk-000/file-000.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f59a6f3e1a883a2a1ce1c1cad8182cde28cc7224 Binary files /dev/null and b/assets/examples/fractal20220817_data/meta/episodes/chunk-000/file-000.parquet differ diff --git a/assets/examples/fractal20220817_data/meta/info.json b/assets/examples/fractal20220817_data/meta/info.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa934d08fedc7dd79b236254f4b1e9fdba70e0f --- /dev/null +++ b/assets/examples/fractal20220817_data/meta/info.json @@ -0,0 +1,111 @@ +{ + "codebase_version": "v3.0", + "robot_type": "google_robot", + "total_episodes": 1, + "total_frames": 25, + "total_tasks": 1, + "chunks_size": 1000, + "data_files_size_in_mb": 100, + "video_files_size_in_mb": 200, + "fps": 3, + "splits": { + "train": "0:1" + }, + "data_path": "data/chunk-{chunk_index:03d}/file-{file_index:03d}.parquet", + "video_path": 
"videos/{video_key}/chunk-{chunk_index:03d}/file-{file_index:03d}.mp4", + "features": { + "observation.images.image": { + "dtype": "video", + "shape": [ + 256, + 320, + 3 + ], + "names": [ + "height", + "width", + "rgb" + ], + "info": { + "video.height": 256, + "video.width": 320, + "video.codec": "av1", + "video.pix_fmt": "yuv420p", + "video.is_depth_map": false, + "video.fps": 3, + "video.channels": 3, + "has_audio": false + } + }, + "observation.state": { + "dtype": "float32", + "shape": [ + 8 + ], + "names": { + "motors": [ + "x", + "y", + "z", + "rx", + "ry", + "rz", + "rw", + "gripper" + ] + } + }, + "action": { + "dtype": "float32", + "shape": [ + 7 + ], + "names": { + "motors": [ + "x", + "y", + "z", + "roll", + "pitch", + "yaw", + "gripper" + ] + } + }, + "timestamp": { + "dtype": "float32", + "shape": [ + 1 + ], + "names": null + }, + "frame_index": { + "dtype": "int64", + "shape": [ + 1 + ], + "names": null + }, + "episode_index": { + "dtype": "int64", + "shape": [ + 1 + ], + "names": null + }, + "index": { + "dtype": "int64", + "shape": [ + 1 + ], + "names": null + }, + "task_index": { + "dtype": "int64", + "shape": [ + 1 + ], + "names": null + } + } +} diff --git a/assets/examples/fractal20220817_data/meta/stats.json b/assets/examples/fractal20220817_data/meta/stats.json new file mode 100644 index 0000000000000000000000000000000000000000..1eef42cfc5d40ee972fe74b7df833ac81d123584 --- /dev/null +++ b/assets/examples/fractal20220817_data/meta/stats.json @@ -0,0 +1,501 @@ +{ + "episode_index": { + "min": [ + 0 + ], + "max": [ + 87211 + ], + "mean": [ + 43687.47439045023 + ], + "std": [ + 25161.433385824046 + ], + "count": [ + 3786400 + ], + "q01": [ + 43687.47439045023 + ], + "q10": [ + 43687.47439045023 + ], + "q50": [ + 43687.47439045023 + ], + "q90": [ + 43687.47439045023 + ], + "q99": [ + 43687.47439045023 + ] + }, + "timestamp": { + "min": [ + 0.0 + ], + "max": [ + 216.33333333333334 + ], + "mean": [ + 8.793377526582546 + ], + "std": [ + 
8.044006875848723 + ], + "count": [ + 3786400 + ], + "q01": [ + 0.019118576965716935 + ], + "q10": [ + 1.6091612757572917 + ], + "q50": [ + 8.711732604248807 + ], + "q90": [ + 15.944715586261385 + ], + "q99": [ + 17.56541885375544 + ] + }, + "frame_index": { + "min": [ + 0 + ], + "max": [ + 649 + ], + "mean": [ + 26.380132579759145 + ], + "std": [ + 24.13202062754607 + ], + "count": [ + 3786400 + ], + "q01": [ + 0.057355731096994454 + ], + "q10": [ + 4.827483827437957 + ], + "q50": [ + 26.133017805039607 + ], + "q90": [ + 47.83414675861042 + ], + "q99": [ + 52.696256561060494 + ] + }, + "observation.state": { + "min": [ + -0.4436439275741577, + -0.9970501065254211, + -0.006579156965017319, + -0.8643477559089661, + -0.7079970240592957, + -0.7688722014427185, + -0.4999994933605194, + 0.0 + ], + "max": [ + 1.0534898042678833, + 0.48018959164619446, + 1.6896663904190063, + 0.9999993443489075, + 0.9999874830245972, + 0.9554369449615479, + 0.9914546012878418, + 1.0 + ], + "mean": [ + 0.5582046028643536, + -0.08324323429555819, + 0.7708198142579612, + -0.24752762586024715, + 0.4959921774813562, + 0.09255771451332732, + 0.2094189021656009, + 0.4261956376121668 + ], + "std": [ + 0.12440319979560374, + 0.11571359396154758, + 0.245894357010222, + 0.5132342575426315, + 0.5223439100950532, + 0.16665986288677848, + 0.27617123861598164, + 0.45387534501253995 + ], + "count": [ + 3786400 + ], + "q01": [ + 0.408963256356151, + -0.23145526982140982, + 0.5935708434664594, + -0.64105388872015, + 0.07054327665810646, + -0.12634555347420393, + -0.09304175200127232, + 2.199466787180472e-05 + ], + "q10": [ + 0.4428541524403959, + -0.19135969252525398, + 0.6060833036444486, + -0.5704538476018918, + 0.17980850596534304, + -0.06904302420001972, + -0.009461047441014432, + 0.007395965520886004 + ], + "q50": [ + 0.5487401925001684, + -0.07700300862678738, + 0.7668444825525346, + -0.2660065536555911, + 0.5238047414960931, + 0.09922105682856339, + 0.22535350104783028, + 0.3058211724179186 + ], + 
"q90": [ + 0.6841599570198298, + 0.004299989107951686, + 0.9432045306065185, + 0.0771875547069565, + 0.7731956966295638, + 0.23161431007069846, + 0.39114578975783715, + 0.9598732336358585 + ], + "q99": [ + 0.6963673544148412, + 0.01644260025180769, + 1.000799300047394, + 0.16702012495529103, + 0.8204194259313271, + 0.26266766557977766, + 0.42878282571630694, + 0.9689303361079542 + ] + }, + "index": { + "min": [ + 0 + ], + "max": [ + 3786399 + ], + "mean": [ + 1893199.5 + ], + "std": [ + 1093039.5296297753 + ], + "count": [ + 3786400 + ], + "q01": [ + 1893173.1772231467 + ], + "q10": [ + 1893177.9473516669 + ], + "q50": [ + 1893199.2556619933 + ], + "q90": [ + 1893220.9548600307 + ], + "q99": [ + 1893225.8161719302 + ] + }, + "task_index": { + "min": [ + 0 + ], + "max": [ + 598 + ], + "mean": [ + 143.40194855271227 + ], + "std": [ + 132.58829846252522 + ], + "count": [ + 3786400 + ], + "q01": [ + 143.40194855271227 + ], + "q10": [ + 143.40194855271227 + ], + "q50": [ + 143.40194855271227 + ], + "q90": [ + 143.4019485527123 + ], + "q99": [ + 143.4019485527123 + ] + }, + "action": { + "min": [ + -2.0204520225524902, + -5.497899532318115, + -2.031663417816162, + -1.569917917251587, + -1.569892168045044, + -1.570419430732727, + 0.0 + ], + "max": [ + 2.9984593391418457, + 22.09052848815918, + 2.7507524490356445, + 1.570636510848999, + 1.5321086645126343, + 1.5691522359848022, + 1.0 + ], + "mean": [ + 0.006986742172085009, + 0.006266400645656175, + -0.01262561945294701, + 0.0433347717660516, + -0.005755843126369116, + 0.0009133710921551621, + 0.5354204546016359 + ], + "std": [ + 0.06943342817907261, + 0.05987580077761333, + 0.0738429111488631, + 0.15697640193310985, + 0.1319237681096443, + 0.1463219227874845, + 0.49874381422625697 + ], + "count": [ + 3786400 + ], + "q01": [ + -0.12221348527912608, + -0.09609365736658501, + -0.15100898896136394, + -0.2185607741848074, + -0.2965577733895619, + -0.3014479600294951, + 0.027980111681782445 + ], + "q10": [ + 
-0.0660911694758722, + -0.04637928472530103, + -0.09462124253497246, + -0.1003841425402165, + -0.1575445953883812, + -0.1529437071341076, + 0.030033117533913266 + ], + "q50": [ + 0.004505050576012514, + 0.003205040885008195, + -0.010057818458173608, + 0.01762125013928238, + 0.00035042878599716304, + -0.000527023381541794, + 0.6160485027421297 + ], + "q90": [ + 0.08102828768740676, + 0.06249624540697094, + 0.06695364553140282, + 0.22499897324678897, + 0.129533823452432, + 0.15507156750538909, + 0.9883197184355382 + ], + "q99": [ + 0.1298058026221481, + 0.10786538799511407, + 0.13116332716025397, + 0.4193381877333532, + 0.2438051563655085, + 0.3007432678772545, + 0.9977885722166413 + ] + }, + "observation.images.image": { + "min": [ + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ], + [ + [ + 0.0 + ] + ] + ], + "max": [ + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ], + [ + [ + 1.0 + ] + ] + ], + "mean": [ + [ + [ + 0.3957461965864481 + ] + ], + [ + [ + 0.2862227698842343 + ] + ], + [ + [ + 0.2067759676712127 + ] + ] + ], + "std": [ + [ + [ + 0.03559225425494718 + ] + ], + [ + [ + 0.02675087532213864 + ] + ], + [ + [ + 0.023519621913043576 + ] + ] + ], + "count": [ + 3741992 + ], + "q01": [ + [ + [ + 0.0024661240356684635 + ] + ], + [ + [ + 0.0007401937026290603 + ] + ], + [ + [ + 4.542013660436245e-05 + ] + ] + ], + "q10": [ + [ + [ + 0.04357633479047807 + ] + ], + [ + [ + 0.02722412507799719 + ] + ], + [ + [ + 0.006618760305829143 + ] + ] + ], + "q50": [ + [ + [ + 0.4320693558492684 + ] + ], + [ + [ + 0.30031301324089443 + ] + ], + [ + [ + 0.20738113717924858 + ] + ] + ], + "q90": [ + [ + [ + 0.6783619024719743 + ] + ], + [ + [ + 0.5187178433507462 + ] + ], + [ + [ + 0.40475137588524834 + ] + ] + ], + "q99": [ + [ + [ + 0.8303000355708119 + ] + ], + [ + [ + 0.6617710723529358 + ] + ], + [ + [ + 0.5504632600300524 + ] + ] + ] + } +} \ No newline at end of file diff --git a/assets/examples/fractal20220817_data/meta/tasks.parquet 
b/assets/examples/fractal20220817_data/meta/tasks.parquet new file mode 100644 index 0000000000000000000000000000000000000000..00801f3a4976976ae261b2ecbb3d39bb877e78d1 Binary files /dev/null and b/assets/examples/fractal20220817_data/meta/tasks.parquet differ diff --git a/assets/examples/fractal20220817_data/videos/observation.images.image/chunk-000/file-000.mp4 b/assets/examples/fractal20220817_data/videos/observation.images.image/chunk-000/file-000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..64dc701d070f15d8652bbfe1af7363731811b4d3 --- /dev/null +++ b/assets/examples/fractal20220817_data/videos/observation.images.image/chunk-000/file-000.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:942fbfe83173ab9382417208f700a1b6520bc47858cde053ff83c73b78189146 +size 252563 diff --git a/assets/examples/manifest.json b/assets/examples/manifest.json new file mode 100644 index 0000000000000000000000000000000000000000..e6199ca8acb23a9e9ea9ba74949b72f1258c4f5f --- /dev/null +++ b/assets/examples/manifest.json @@ -0,0 +1,84 @@ +{ + "fractal20220817_data": { + "source_root": "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/fractal20220817_data", + "source_episode": 2, + "local_episode": 0, + "source_index_range": [ + 181, + 206 + ], + "frames": 25, + "source_task_indices": [ + 2 + ], + "tasks": [ + "pick apple from white bowl" + ], + "videos": [ + "videos/observation.images.image/chunk-000/file-000.mp4" + ] + }, + "droid_plus_lerobot_640x360_20260412": { + "source_root": "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/droid_plus_lerobot_640x360_20260412/success", + "source_episode": 0, + "local_episode": 0, + "source_index_range": [ + 0, + 471 + ], + "frames": 471, + "source_task_indices": [ + 0 + ], + "tasks": [ + "Pour the contents of the yellow cup into the bowl | Pour the contents of the yellow cup into the pink bowl | Pour the contents of the yellow 
cup into the bowl" + ], + "videos": [ + "videos/observation.image.wrist_image_left/chunk-000/file-000.mp4", + "videos/observation.image.exterior_image_1_left/chunk-000/file-000.mp4", + "videos/observation.image.exterior_image_2_left/chunk-000/file-000.mp4" + ] + }, + "robomind_franka": { + "source_root": "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1", + "source_episode": 0, + "local_episode": 0, + "source_index_range": [ + 0, + 120 + ], + "frames": 120, + "source_task_indices": [ + 0 + ], + "tasks": [ + "Close the trash can by pressing down from the back." + ], + "videos": [ + "videos/observation.images.camera_top/chunk-000/file-000.mp4", + "videos/observation.images.camera_left/chunk-000/file-000.mp4", + "videos/observation.images.camera_right/chunk-000/file-000.mp4" + ] + }, + "robomind_franka_dual": { + "source_root": "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water", + "source_episode": 0, + "local_episode": 0, + "source_index_range": [ + 0, + 574 + ], + "frames": 574, + "source_task_indices": [ + 0 + ], + "tasks": [ + "pour water with both arm" + ], + "videos": [ + "videos/observation.images.camera_front/chunk-000/file-000.mp4", + "videos/observation.images.camera_left/chunk-000/file-000.mp4", + "videos/observation.images.camera_right/chunk-000/file-000.mp4" + ] + } +} diff --git a/cosmos-framework/cosmos_framework/__init__.py b/cosmos-framework/cosmos_framework/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cosmos-framework/cosmos_framework/data/__init__.py b/cosmos-framework/cosmos_framework/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/cosmos-framework/cosmos_framework/data/imaginaire/__init__.py b/cosmos-framework/cosmos_framework/data/imaginaire/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cosmos-framework/cosmos_framework/data/imaginaire/webdataset/__init__.py b/cosmos-framework/cosmos_framework/data/imaginaire/webdataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cosmos-framework/cosmos_framework/data/imaginaire/webdataset/augmentors/__init__.py b/cosmos-framework/cosmos_framework/data/imaginaire/webdataset/augmentors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cosmos-framework/cosmos_framework/data/imaginaire/webdataset/augmentors/augmentor.py b/cosmos-framework/cosmos_framework/data/imaginaire/webdataset/augmentors/augmentor.py new file mode 100644 index 0000000000000000000000000000000000000000..57d20050f842ac1e9f00a19815063b2f051d214e --- /dev/null +++ b/cosmos-framework/cosmos_framework/data/imaginaire/webdataset/augmentors/augmentor.py @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: OpenMDW-1.1 + +from collections.abc import Iterable +from typing import Any, Generator, Optional + + +class Augmentor: + def __init__(self, input_keys: list, output_keys: Optional[list] = None, args: Optional[dict] = None) -> None: + r"""Base augmentor class + + Args: + input_keys (list): List of input keys + output_keys (list): List of output keys + args (dict): Arguments associated with the augmentation + """ + self.input_keys = input_keys + self.output_keys = output_keys + self.args = args + + def __call__(self, *args: Any, **kwds: Any) -> Any: + raise ValueError("Augmentor not implemented") + + +class IterableAugmentor: + def __init__(self, input_keys: list, output_keys: Optional[list] = None, args: Optional[dict] = None) -> None: + r"""Base augmentor class + + Args: + input_keys (list): List of input keys + output_keys (list): List of output keys + args (dict): Arguments associated with the augmentation + """ + self.input_keys = input_keys + self.output_keys = output_keys + self.args = args + self.is_generator = True + + def __call__(self, data: Iterable) -> Generator: + r"""Example usage: + + for data_dict in data: + # Do something to data_dict + data_dict["input"] = data_dict["raw_sequence"][:, :-1] + data_dict["target"] = data_dict["raw_sequence"][:, 1:] + # Skip sample if needed + if data_dict["input"].shape[1] < 64: + continue + # Construct a generator + yield data_dict + """ + raise ValueError("Augmentor not implemented") diff --git a/cosmos-framework/cosmos_framework/data/vfm/__init__.py b/cosmos-framework/cosmos_framework/data/vfm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cosmos-framework/cosmos_framework/data/vfm/action/__init__.py b/cosmos-framework/cosmos_framework/data/vfm/action/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..503ec1b18d584ba1c349360dedbe6951e3216df6 --- /dev/null +++ 
def load_action_stats(stats_path: str, stats_key: str = "global") -> dict[str, np.ndarray]:
    """Load pre-computed action normalization stats from a JSON file.

    The JSON document may either hold the statistics at the top level or nest
    them under ``stats_key``. When ``stats_key`` is the default ``"global"``
    and no such block exists, the top-level object itself is used.

    Args:
        stats_path: Path to the JSON file.
        stats_key: Name of the nested block to read.

    Returns:
        Mapping from stat name (subset of ``mean``, ``std``, ``min``, ``max``,
        ``q01``, ``q99``) to a ``float32`` numpy array. Other keys are dropped.

    Raises:
        FileNotFoundError: ``stats_path`` does not exist.
        TypeError: The document, or the selected block, is not a JSON object.
        KeyError: A non-default ``stats_key`` is missing from the document.
    """
    path = Path(stats_path)
    if not path.exists():
        raise FileNotFoundError(f"Action normalization stats not found at {stats_path}.")
    log.info(f"Loading action normalization stats from {stats_path}")
    with path.open("r", encoding="utf-8") as f:
        raw = json.load(f)
    # A top-level list/scalar would otherwise fail later with an opaque
    # AttributeError on ``.items()``.
    if not isinstance(raw, dict):
        raise TypeError(f"Action normalization stats in {stats_path} must be a JSON object.")
    if stats_key in raw:
        raw = raw[stats_key]
        if not isinstance(raw, dict):
            raise TypeError(f"Action normalization stats block {stats_key!r} in {stats_path} must be a dict.")
    elif stats_key != "global":
        raise KeyError(f"Action normalization stats block {stats_key!r} not found in {stats_path}.")
    stat_keys = {"mean", "std", "min", "max", "q01", "q99"}
    return {k: np.array(v, dtype=np.float32) for k, v in raw.items() if k in stat_keys}


def normalize_action(
    action: torch.Tensor,
    method: str,
    stats: "dict[str, torch.Tensor | np.ndarray]",
) -> torch.Tensor:
    """Normalize an action tensor (all dimensions, including gripper).

    Args:
        action: Action tensor to normalize.
        method: ``"quantile"`` (maps ``[q01, q99]`` to ``[-1, 1]`` and clamps),
            ``"meanstd"`` (z-score, no clamping) or ``"minmax"`` (maps
            ``[min, max]`` to ``[-1, 1]`` and clamps).
        stats: Statistics keyed by name. Values may be torch tensors or
            array-likes such as the numpy arrays returned by
            :func:`load_action_stats`; non-tensor values are converted onto
            ``action``'s device. Tensor values are used as given.

    Returns:
        The normalized action tensor.

    Raises:
        ValueError: ``method`` is not one of the supported names.
        KeyError: A statistic required by ``method`` is missing from ``stats``.
    """

    def _stat(name: str) -> torch.Tensor:
        if name not in stats:
            raise KeyError(f"Action normalization method {method!r} requires stat {name!r}")
        value = stats[name]
        if isinstance(value, torch.Tensor):
            return value
        # load_action_stats yields numpy arrays, which have no ``.clamp``.
        return torch.as_tensor(value, device=action.device)

    # Denominators are floored at 1e-8 so constant dimensions do not divide by zero.
    if method == "quantile":
        q01, q99 = _stat("q01"), _stat("q99")
        denom = (q99 - q01).clamp(min=1e-8)
        return (2.0 * (action - q01) / denom - 1.0).clamp(-1.0, 1.0)
    if method == "meanstd":
        return (action - _stat("mean")) / _stat("std").clamp(min=1e-8)
    if method == "minmax":
        lo, hi = _stat("min"), _stat("max")
        denom = (hi - lo).clamp(min=1e-8)
        return (2.0 * (action - lo) / denom - 1.0).clamp(-1.0, 1.0)
    raise ValueError(f"Unknown normalization method: {method!r}")
+Build one declaratively via :func:`build_action_spec` from DSL components:: + + build_action_spec(Pos(), Rot("rot6d"), Gripper()) # 10D single arm + build_action_spec(Pos(), Rot("rot6d")) # 9D no gripper + build_action_spec(Joint(n=14, label="arm"), # 30D joint-space + Joint(n=14, label="end"), + Joint(n=2, label="gripper")) + build_action_spec(Pos(prefix="left"), Rot("rot6d", "left"), Gripper(prefix="left"), + Pos(prefix="right"), Rot("rot6d", "right"), Gripper(prefix="right")) + +Naming convention: + Default ``pos_x``, ``rot_0``, ``gripper``, ``arm_0`` ... + With ``prefix="left"`` (idempotent on trailing ``_``): ``left_pos_x`` ... +""" + +from __future__ import annotations + +from dataclasses import dataclass +from enum import Enum +from typing import ClassVar + +from cosmos_framework.data.vfm.action.pose_utils import ( + RotationConvention, + _identity_rotation_vector, +) + + +class DimType(str, Enum): + """Per-column action-dim category (drives idle detection).""" + + POS = "pos" + ROT = "rot" + GRIPPER = "gripper" + JOINT = "joint" + RESERVED = "reserved" + + +@dataclass(frozen=True, slots=True) +class ActionSpec: + """Structural description of an action vector: names + per-dim types. + + All ROT dims share a single ``rotation_format``; mixed formats in one spec + are not supported (raise at build time). + + This struct contains no detection thresholds — those are passed at call + time to :func:`compute_idle_frames` so each dataset can tune them + independently of layout. 
+ """ + + names: list[str] + types: list[DimType] + rotation_format: RotationConvention = "rot6d" + + @property + def dim(self) -> int: + return len(self.names) + + +# --------------------------------------------------------------------------- +# DSL components +# --------------------------------------------------------------------------- + + +def _join_prefix(prefix: str, name: str) -> str: + """Join ``prefix`` and ``name`` with a single ``_``; idempotent on trailing ``_``.""" + return name if not prefix else f"{prefix.rstrip('_')}_{name}" + + +@dataclass(frozen=True) +class Pos: + """Translation block. + + Default 3D (``pos_x``, ``pos_y``, ``pos_z``). For planar tasks (e.g. PushT) + use ``Pos(dim=2)`` → ``pos_x``, ``pos_y``. ``dim >= 4`` falls back to + indexed names ``pos_0``, ``pos_1``, ... + """ + + dim: int = 3 + prefix: str = "" + type: ClassVar[DimType] = DimType.POS + + def names(self) -> list[str]: + if self.dim <= 3: + return [_join_prefix(self.prefix, f"pos_{c}") for c in "xyz"[: self.dim]] + return [_join_prefix(self.prefix, f"pos_{i}") for i in range(self.dim)] + + +@dataclass(frozen=True) +class Rot: + """Rotation block; ``format`` selects the encoding. + + Supported formats and per-dim names: + + - ``rot6d`` → 6 dims, ``rot_0`` ... ``rot_5`` (identity ``[1,0,0,0,1,0]``) + - ``rot9d`` → 9 dims, ``rot_0`` ... 
``rot_8`` (identity ``[1,0,0,0,1,0,0,0,1]``) + - ``euler_xyz`` → 3 dims, ``roll``, ``pitch``, ``yaw`` (identity ``[0,0,0]``) + - ``axisangle`` → 3 dims, ``axang_x/y/z`` (identity ``[0,0,0]``) + - ``quat_xyzw`` / ``quat_wxyz`` → 4 dims, ``quat_x/y/z/w`` in declared order + """ + + format: RotationConvention = "rot6d" + prefix: str = "" + type: ClassVar[DimType] = DimType.ROT + + @property + def rotation_format(self) -> RotationConvention: + return self.format + + @property + def dim(self) -> int: + return _identity_rotation_vector(self.format).shape[0] + + def names(self) -> list[str]: + if self.format == "euler_xyz": + return [_join_prefix(self.prefix, c) for c in ("roll", "pitch", "yaw")] + if self.format == "axisangle": + return [_join_prefix(self.prefix, f"axang_{c}") for c in "xyz"] + if self.format.startswith("quat_"): + order = self.format.split("_", 1)[1] # "xyzw" or "wxyz" + return [_join_prefix(self.prefix, f"quat_{c}") for c in order] + return [_join_prefix(self.prefix, f"rot_{i}") for i in range(self.dim)] + + +@dataclass(frozen=True) +class Gripper: + """1D gripper command (binary 0/1 or continuous). Detected by frame-diff.""" + + prefix: str = "" + type: ClassVar[DimType] = DimType.GRIPPER + + @property + def dim(self) -> int: + return 1 + + def names(self) -> list[str]: + return [_join_prefix(self.prefix, "gripper")] + + +@dataclass(frozen=True) +class Joint: + """``n`` joint commands. 
Detected by frame-diff against ``joint_threshold``.""" + + n: int = 0 + label: str = "joint" + prefix: str = "" + type: ClassVar[DimType] = DimType.JOINT + + @property + def dim(self) -> int: + return self.n + + def names(self) -> list[str]: + return [_join_prefix(self.prefix, f"{self.label}_{i}") for i in range(self.n)] + + +@dataclass(frozen=True) +class Reserved: + """``n`` dims counted in ``action_dim`` but ignored by idle detection.""" + + n: int = 0 + label: str = "reserved" + prefix: str = "" + type: ClassVar[DimType] = DimType.RESERVED + + @property + def dim(self) -> int: + return self.n + + def names(self) -> list[str]: + return [_join_prefix(self.prefix, f"{self.label}_{i}") for i in range(self.n)] + + +# --------------------------------------------------------------------------- +# Builder +# --------------------------------------------------------------------------- + + +# Type alias for any DSL component. Not a runtime check — only annotation hint. +Component = Pos | Rot | Gripper | Joint | Reserved + + +def build_action_spec(*components: Component) -> ActionSpec: + """Compose ``components`` into an :class:`ActionSpec`. + + Each component contributes its ``names()`` and replicates its ``type`` for + every column it occupies. The first ROT component's ``rotation_format`` + is captured for the whole spec; mixing formats raises ``ValueError``. 
+ """ + names: list[str] = [] + types: list[DimType] = [] + rotation_format: RotationConvention | None = None + + for c in components: + names.extend(c.names()) + types.extend([c.type] * c.dim) + if c.type == DimType.ROT: + fmt = c.rotation_format # type: ignore[union-attr] + if rotation_format is None: + rotation_format = fmt + elif rotation_format != fmt: + raise ValueError(f"Mixed rotation_format in one ActionSpec: {rotation_format!r} vs {fmt!r}") + + return ActionSpec( + names=names, + types=types, + rotation_format=rotation_format or "rot6d", + ) + + +__all__ = [ + "ActionSpec", + "Component", + "DimType", + "Gripper", + "Joint", + "Pos", + "Reserved", + "Rot", + "build_action_spec", +] diff --git a/cosmos-framework/cosmos_framework/data/vfm/action/av_dataset.py b/cosmos-framework/cosmos_framework/data/vfm/action/av_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..d35753fdbf96947228e33de39430ee8746aa23ad --- /dev/null +++ b/cosmos-framework/cosmos_framework/data/vfm/action/av_dataset.py @@ -0,0 +1,1026 @@ +"""AV Dataset for Action training. + +This module provides an IterableDataset for AV data +that loads S3 storage containing tar files with video, action trajectories, +and route waypoints. 
def decode_video_bytes(
    video_bytes: bytes,
    resolution: str | None = None,
    history_len: float | None = None,
    future_len: float | None = None,
    original_history_steps: int | None = None,
) -> tuple[torch.Tensor, float]:
    """Decode video from mp4 bytes using torchvision.io.

    Args:
        video_bytes: Raw mp4 video bytes.
        resolution: Target resolution for video frames (e.g. "256", "480"). If None, keeps original resolution.
        history_len: Desired history length in seconds. If None, the whole history portion is kept.
        future_len: Desired future length in seconds. If None, the whole future portion is kept.
        original_history_steps: Number of frames in the original history portion of the video.
            Required despite the ``None`` default.

    Returns:
        Tuple of (video tensor in (C, T, H, W) uint8 format, original fps).
        The fps comes from the container metadata and defaults to 30.0 when absent.

    Raises:
        ValueError: ``original_history_steps`` is missing, a requested duration is
            negative, or a requested duration exceeds the available frames.

    Note:
        The video structure is [history_frames | future_frames]. When slicing:
        - History portion: take last (history_len * fps) frames from video[:original_history_steps]
        - Future portion: take first (future_len * fps) frames from video[original_history_steps:]
        This mirrors the slicing in process_action_trajectory.
    """
    # Fail fast: no point decoding the video if it cannot be split afterwards.
    if original_history_steps is None:
        raise ValueError("original_history_steps is required to slice video")

    # Write bytes to a temporary file for torchvision.io.read_video
    import tempfile

    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=True) as tmp_file:
        tmp_file.write(video_bytes)
        tmp_file.flush()

        # Read video using torchvision.io while the temp file still exists.
        # Returns: (video_frames, audio_frames, info)
        # video_frames shape: (T, H, W, C) uint8
        video_frames, _, info = torchvision.io.read_video(tmp_file.name, pts_unit="sec")

    original_fps = info.get("video_fps", 30.0)

    # Split video at history/future boundary
    history_video = video_frames[:original_history_steps]
    future_video = video_frames[original_history_steps:]

    # Slice history (take last N frames)
    if history_len is not None:
        # NOTE(review): int() truncates, so float round-off in the product can
        # drop one frame — confirm whether round() is the intended behavior.
        history_steps = int(history_len * original_fps)
        if history_steps < 0:
            raise ValueError(f"history_len must be non-negative, got {history_len}")
        if history_steps > history_video.shape[0]:
            raise ValueError(
                f"Requested history_len={history_len}s ({history_steps} frames at {original_fps}Hz) "
                f"exceeds available history video ({history_video.shape[0]} frames)"
            )
        # Explicit start index: ``x[-0:]`` would return every frame instead of none.
        history_video = history_video[history_video.shape[0] - history_steps :]

    # Slice future (take first N frames)
    if future_len is not None:
        future_steps = int(future_len * original_fps)
        if future_steps < 0:
            raise ValueError(f"future_len must be non-negative, got {future_len}")
        if future_steps > future_video.shape[0]:
            raise ValueError(
                f"Requested future_len={future_len}s ({future_steps} frames at {original_fps}Hz) "
                f"exceeds available future video ({future_video.shape[0]} frames)"
            )
        future_video = future_video[:future_steps]

    # Concatenate sliced portions
    video_frames = torch.cat([history_video, future_video], dim=0)  # [T,H,W,C]

    # Convert from (T, H, W, C) to (T, C, H, W)
    video_tensor = video_frames.permute(0, 3, 1, 2)  # [T,C,H,W]

    # Resize and Crop if resolution is provided
    if resolution is not None:
        T, C, H, W = video_tensor.shape
        # get_target_size_and_crop expects (resolution, current_H, current_W)
        new_H, new_W, target_canvas_H, target_canvas_W = get_target_size_and_crop(resolution, H, W)

        # Resize if needed
        if new_H != H or new_W != W:
            video_tensor = F.resize(
                video_tensor, [new_H, new_W], interpolation=F.InterpolationMode.BICUBIC, antialias=True
            )

        # Center Crop
        if new_H != target_canvas_H or new_W != target_canvas_W:
            video_tensor = F.center_crop(video_tensor, [target_canvas_H, target_canvas_W])

    # Convert to uint8 if not already
    if video_tensor.dtype != torch.uint8:
        video_tensor = video_tensor.to(torch.uint8)

    # Convert from (T, C, H, W) to (C, T, H, W)
    video_tensor = video_tensor.permute(1, 0, 2, 3)  # [C,T,H,W]

    return video_tensor, original_fps
# 3x3 rotation from car convention (x=forward, y=left, z=up)
# to OpenCV convention (x=right, y=down, z=forward).
# Mapping: new_x = -old_y, new_y = -old_z, new_z = old_x
CAR_TO_OPENCV_ROTATION = np.array(
    [[0, -1, 0], [0, 0, -1], [1, 0, 0]],
    dtype=np.float32,
)


def process_action_trajectory(
    action_data: dict,
    history_len: float | None = None,
    future_len: float | None = None,
    fps: int = 10,
    rotation_format: Literal["9D", "rot6d", "quat_xyzw", "euler_xyz"] = "9D",
    pose_convention: Literal["backward_anchored", "backward_framewise"] = ("backward_framewise"),
    scale: float = 1.0,
    rotation_scale: float = 1.0,
    max_translation_norm: float | None = None,
    align_opencv_pose: bool = False,
) -> tuple[torch.Tensor, torch.Tensor] | None:
    """Process action trajectories from action data dict.

    Args:
        action_data: Dict with:
            - history_xyz: (T_hist, 3) tensor - position history
            - history_quat: (T_hist, 4) tensor - quaternion history, [w, x, y, z]
            - future_xyz: (T_fut, 3) tensor - position future
            - future_quat: (T_fut, 4) tensor - quaternion future, [w, x, y, z]
        history_len: Desired history length in seconds. If None, all history is used.
        future_len: Desired future length in seconds. If None, all future is used.
        fps: Frames per second, used to compute number of steps from time durations.
        rotation_format: Rotation encoding forwarded to ``pose_abs_to_rel``.
            NOTE(review): the default here is ``"9D"`` while ``AVDataset`` in this
            file defaults to ``"rot9d"`` — confirm which spelling ``pose_abs_to_rel`` accepts.
        pose_convention: Relative-pose convention forwarded to ``pose_abs_to_rel``.
        scale: Forwarded to ``pose_abs_to_rel`` as ``translation_scale``.
        rotation_scale: Forwarded to ``pose_abs_to_rel`` as ``rotation_scale``.
        max_translation_norm: If set, the sample is rejected (``None`` is returned)
            when any per-step translation L2 norm (first three action columns)
            exceeds this value.
        align_opencv_pose: If True, transform poses from car convention
            (x=forward, y=left, z=up) to OpenCV convention (x=right, y=down, z=forward).
            NOTE: av_v2_* data is already in OpenCV convention, DO NOT apply this transformation!

    Returns:
        ``None`` when the sample is rejected by ``max_translation_norm``; otherwise a
        tuple ``(history_action, future_action)`` of torch tensors. The relative
        trajectory has one row fewer than the number of poses; the first
        ``T_hist - 1`` rows are history actions and the rest are future actions.
        The column count is whatever ``pose_abs_to_rel`` emits for ``rotation_format``.

    Raises:
        ValueError: A requested duration is negative or exceeds the available steps.

    Note:
        History steps = history_len * fps, same as future steps = future_len * fps.
        For example, with history_len=1.0s and fps=10, we get 10 history steps.
    """
    # Extract and ensure tensors
    history_xyz = action_data["history_xyz"]
    history_quat = action_data["history_quat"]
    future_xyz = action_data["future_xyz"]
    future_quat = action_data["future_quat"]

    # Convert to tensors if needed
    if not isinstance(history_xyz, torch.Tensor):
        history_xyz = torch.tensor(history_xyz, dtype=torch.float32)  # [T_hist,3]
    if not isinstance(history_quat, torch.Tensor):
        history_quat = torch.tensor(history_quat, dtype=torch.float32)  # [T_hist,4]
    if not isinstance(future_xyz, torch.Tensor):
        future_xyz = torch.tensor(future_xyz, dtype=torch.float32)  # [T_fut,3]
    if not isinstance(future_quat, torch.Tensor):
        future_quat = torch.tensor(future_quat, dtype=torch.float32)  # [T_fut,4]

    # Slice history to desired length (take the last N steps)
    if history_len is not None:
        history_steps = int(history_len * fps)
        available_history = history_xyz.shape[0]
        if history_steps < 0:
            raise ValueError(f"history_len must be non-negative, got {history_len}")
        if history_steps > available_history:
            raise ValueError(
                f"Requested history_len={history_len}s ({history_steps} steps at {fps}Hz) "
                f"exceeds available history ({available_history} steps)"
            )
        # Explicit start index: ``x[-0:]`` would keep the whole history instead of none.
        start = available_history - history_steps
        history_xyz = history_xyz[start:]
        history_quat = history_quat[start:]

    # Slice future to desired length (take the first N steps)
    if future_len is not None:
        future_steps = int(future_len * fps)
        available_future = future_xyz.shape[0]
        if future_steps < 0:
            raise ValueError(f"future_len must be non-negative, got {future_len}")
        if future_steps > available_future:
            raise ValueError(
                f"Requested future_len={future_len}s ({future_steps} steps at {fps}Hz) "
                f"exceeds available future ({available_future} steps)"
            )
        future_xyz = future_xyz[:future_steps]
        future_quat = future_quat[:future_steps]

    # Concatenate to form full trajectory
    all_xyz = torch.cat([history_xyz, future_xyz], dim=0)  # [T,3]
    all_quat = torch.cat([history_quat, future_quat], dim=0)  # [T,4]

    poses_abs = build_abs_pose_from_components(
        all_xyz,
        all_quat,
        "quat_wxyz",
    )

    if align_opencv_pose:
        poses_abs[:, :3, :3] = poses_abs[:, :3, :3] @ CAR_TO_OPENCV_ROTATION.T

    actions = pose_abs_to_rel(
        poses_abs,
        rotation_format=rotation_format,
        pose_convention=pose_convention,
        translation_scale=scale,
        rotation_scale=rotation_scale,
    )

    # Outlier filter; skipped for an empty trajectory, where ``.max()`` would raise.
    if max_translation_norm is not None and len(actions) > 0:
        trans_norms = np.linalg.norm(actions[:, :3], axis=1)
        if trans_norms.max() > max_translation_norm:
            return None

    actions = torch.from_numpy(actions)  # [T-1,action_dim]

    # Split back. History contributes one action fewer than it has poses because
    # the first pose only anchors the relative trajectory. With an empty history
    # the split index must be 0, not -1 (which would mislabel all but the last row).
    num_history_actions = max(len(history_xyz) - 1, 0)
    history_action = actions[:num_history_actions]
    future_action = actions[num_history_actions:]

    return history_action, future_action
def add_route_noise(
    route: torch.Tensor,
    lat_noise_range: float = 0.0,
    long_noise_range: float = 0.0,
    point_wise_noise: float = 0.0,
) -> torch.Tensor:
    """Jitter route waypoints for data augmentation.

    Two independent perturbations are applied, neither of which touches z:
      1. One uniform (longitudinal X, lateral Y) offset shared by every waypoint.
      2. Independent Gaussian noise per waypoint.

    NaN waypoints stay NaN because the noise is purely additive.

    Args:
        route: (T, 3) tensor of route waypoints in XYZ.
        lat_noise_range: Half-range of the uniform lateral (Y) offset.
        long_noise_range: Half-range of the uniform longitudinal (X) offset.
        point_wise_noise: Standard deviation of the per-waypoint Gaussian noise.

    Returns:
        Route tensor of shape (T, 3); the input object itself when all noise
        parameters are non-positive.
    """
    wants_shift = lat_noise_range > 0 or long_noise_range > 0
    if wants_shift:
        full_span = torch.tensor([2 * long_noise_range, 2 * lat_noise_range, 0.0])
        half_span = torch.tensor([long_noise_range, lat_noise_range, 0.0])
        # Uniform sample in [-half_span, +half_span) per axis; z span is zero.
        offset = torch.rand(3) * full_span - half_span  # [3]
        route = route + offset.unsqueeze(0)

    if point_wise_noise > 0:
        jitter = torch.randn(route.shape[0], 3) * point_wise_noise  # [T,3]
        jitter[:, 2] = 0.0  # leave z untouched
        route = route + jitter

    return route


def apply_route_dropout(
    route: torch.Tensor,
    dropout_rate: float = 0.5,
    tail_dropout_rate: float = 0.3,
) -> torch.Tensor:
    """Randomly mask route waypoints with NaN for data augmentation.

    Steps, in order:
      1. With probability ``dropout_rate`` every waypoint is masked; otherwise
         the first K waypoints are masked, K drawn by ``random.randint(0, T)``.
      2. With probability ``tail_dropout_rate`` the waypoints from a random
         index in [T//2, T) onward are masked as well.

    Waypoints whose x is already NaN count as masked. Masked rows are set to
    NaN so downstream code can detect them via ``torch.isnan``.

    Args:
        route: (T, 3) tensor of route waypoints.
        dropout_rate: Probability of masking the entire route.
        tail_dropout_rate: Probability of the additional tail masking.

    Returns:
        A new (T, 3) tensor; the input is not modified.
    """
    num_points = route.shape[0]
    positions = torch.arange(num_points)  # [T]
    already_padded = torch.isnan(route[..., 0])  # [T]

    if random.uniform(0, 1) < dropout_rate:
        dropped = torch.ones(num_points, dtype=torch.bool)  # [T]
    else:
        head_len = random.randint(0, num_points)
        dropped = positions < head_len  # [T]

    if random.uniform(0, 1) < tail_dropout_rate:
        tail_start = 0 if num_points <= 1 else random.randint(num_points // 2, num_points - 1)
        dropped = dropped | (positions >= tail_start)

    masked_route = route.clone()
    masked_route[already_padded | dropped] = float("nan")
    return masked_route
+ + Uses the angle of the displacement in ego frame (X=forward, Y=left) to + determine the driving direction. + + Args: + dx: Forward displacement (positive = forward). + dy: Lateral displacement (positive = left). + move_threshold: Minimum displacement magnitude (meters) to count as movement. + + Returns: + One of: "go forward", "turn left", "turn right", "go backward", "stay". + """ + dist = math.sqrt(dx * dx + dy * dy) + if dist < move_threshold: + return "stay" + + angle_deg = math.degrees(math.atan2(dy, dx)) + + if -45 <= angle_deg <= 45: + return "go forward" + elif 45 < angle_deg <= 135: + return "turn left" + elif -135 <= angle_deg < -45: + return "turn right" + else: + return "go backward" + + +def classify_trajectory_to_text( + trajectory: torch.Tensor, + move_threshold: float = 0.1, + min_segment_steps: int = 2, +) -> str: + """Classify a trajectory in ego frame into a brief semantic text description. + + Classifies each consecutive point pair independently, groups consecutive + identical labels, filters out noisy short groups, and joins distinct + phases with "then". + + Works with any (T, 3) path in ego frame — route waypoints or pose + positions returned by :func:`compute_future_trajectory_in_ego_frame`. + + Args: + trajectory: (T, 3) tensor of positions in ego frame (X=forward, Y=left, Z=up). + The first point is treated as the starting position. + move_threshold: Minimum per-step displacement (meters) to count as movement. + min_segment_steps: Minimum consecutive steps required for a direction label to + be kept; shorter runs are treated as noise and dropped. + + Returns: + A description such as "go forward", "stay then go forward", + "turn left then go forward", or "stay" when the trajectory is empty + or all NaN. 
+ """ + valid_mask = ~torch.isnan(trajectory[:, 0]) + valid_pts = trajectory[valid_mask] + + if len(valid_pts) < 2: + return "stay" + + # Classify every consecutive point pair + step_labels: list[str] = [] + for i in range(len(valid_pts) - 1): + dx = valid_pts[i + 1, 0].item() - valid_pts[i, 0].item() + dy = valid_pts[i + 1, 1].item() - valid_pts[i, 1].item() + step_labels.append(_classify_displacement(dx, dy, move_threshold)) + + # Group consecutive identical labels with their counts + groups: list[tuple[str, int]] = [] + for label in step_labels: + if groups and label == groups[-1][0]: + groups[-1] = (label, groups[-1][1] + 1) + else: + groups.append((label, 1)) + + # Filter out groups shorter than min_segment_steps to suppress noise + if len(groups) > 1: + filtered = [(lbl, cnt) for lbl, cnt in groups if cnt >= min_segment_steps] + if not filtered: + # All groups are short — keep the longest one + filtered = [max(groups, key=lambda g: g[1])] + groups = filtered + + # Deduplicate consecutive identical labels (may arise after filtering) + result = [groups[0][0]] + for label, _ in groups[1:]: + if label != result[-1]: + result.append(label) + + return " then ".join(result) + + +def compute_future_trajectory_in_ego_frame( + action_data: dict, + history_len: float | None = None, + future_len: float | None = None, + fps: int = 10, +) -> torch.Tensor: + """Compute future trajectory positions in the ego coordinate frame. + + Transforms absolute future xyz positions so that the origin is the last + history pose and axes align with ego frame (X=forward, Y=left, Z=up). + + Args: + action_data: Dict with keys ``history_xyz``, ``history_quat``, + ``future_xyz`` (and optionally ``future_quat``). + history_len: History length in seconds for slicing. If *None*, uses all. + future_len: Future length in seconds for slicing. If *None*, uses all. + fps: Frames per second. + + Returns: + (T, 3) float tensor of future positions in ego frame. 
+ """ + history_xyz = action_data["history_xyz"] + history_quat = action_data["history_quat"] + future_xyz = action_data["future_xyz"] + + if not isinstance(history_xyz, torch.Tensor): + history_xyz = torch.tensor(history_xyz, dtype=torch.float32) + if not isinstance(history_quat, torch.Tensor): + history_quat = torch.tensor(history_quat, dtype=torch.float32) + if not isinstance(future_xyz, torch.Tensor): + future_xyz = torch.tensor(future_xyz, dtype=torch.float32) + + # Slice to match the requested durations + if history_len is not None: + history_steps = int(history_len * fps) + history_xyz = history_xyz[-history_steps:] + history_quat = history_quat[-history_steps:] + if future_len is not None: + future_steps = int(future_len * fps) + future_xyz = future_xyz[:future_steps] + + # Current pose = last history frame + current_pos = history_xyz[-1] # (3,) + current_quat_wxyz = history_quat[-1] # (4,) [w, x, y, z] + + # Scipy expects [x, y, z, w] + quat_xyzw = current_quat_wxyz[[1, 2, 3, 0]].numpy() + rot_world_to_ego = Rotation.from_quat(quat_xyzw).inv() + + # Translate then rotate into ego frame + future_rel = (future_xyz - current_pos[None, :]).numpy() + future_ego = rot_world_to_ego.apply(future_rel).astype(np.float32) + + return torch.from_numpy(future_ego) + + +class AVDataset(IterableDataset): + """AV dataset that reads tar files from S3 using wdinfo.json.""" + + def __init__( + self, + root: str | list[str] = "s3://nv-00-10206-robot/cosmos3_action_data/av_v2_02182026_wdinfo/", + credential_path: str = "credentials/gcp_training.secret", + resolution: str | None = None, + fps: int = 10, + mode: str = "policy", + embodiment_type: str = "av", + split: str = "train", + seed: int = 0, + shuffle: bool = True, + history_len: float | None = None, + future_len: float | None = None, + rotation_format: RotationConvention = "rot9d", + pose_convention: Literal["backward_anchored", "backward_framewise"] = ("backward_framewise"), + route_lat_noise_range: float = 0.0, + 
    def __init__(
        self,
        root: str | list[str] = "s3://nv-00-10206-robot/cosmos3_action_data/av_v2_02182026_wdinfo/",
        credential_path: str = "credentials/gcp_training.secret",
        resolution: str | None = None,
        fps: int = 10,
        mode: str = "policy",
        embodiment_type: str = "av",
        split: str = "train",
        seed: int = 0,
        shuffle: bool = True,
        history_len: float | None = None,
        future_len: float | None = None,
        rotation_format: RotationConvention = "rot9d",
        pose_convention: Literal["backward_anchored", "backward_framewise"] = ("backward_framewise"),
        route_lat_noise_range: float = 0.0,
        route_long_noise_range: float = 0.0,
        route_point_wise_noise: float = 0.0,
        route_dropout: bool = False,
        route_dropout_rate: float = 0.0,
        route_tail_dropout_rate: float = 0.0,
        include_route_in_prompt: bool = True,
        use_semantic_route_prompt: bool = False,
        translation_scale: float = 1.0,
        rotation_scale: float = 1.0,
        max_action_translation_norm: float | None = None,
        align_opencv_pose: bool = False,
        # When True, use a separate domain ID for inverse dynamics / policy modes
        # so that DomainAwareLinear learns different projections for anchored (conditioning)
        # vs framewise (generation) action representations.
        mode_aware_domain: bool = False,
        inv_embodiment_type: str = "av_inv",
    ):
        """Initialize AVDataset.

        Stores the configuration, validates ``mode``, ``split`` and every root,
        configures the easy_io S3 backend, and calls ``_load_wdinfo()``.

        Args:
            root: Path (or list of paths) to wdinfo directories containing train/val subdirectories
                with wdinfo.json files. Each entry must start with ``s3://`` or ``gs://`` or be an
                existing local path; trailing slashes are stripped.
            credential_path: Path to JSON file containing S3 credentials.
            resolution: Target resolution for video frames (e.g. "256", "480"). If None, keeps original resolution.
            fps: Target frames per second for video and actions.
            mode: Training mode ('policy', 'forward_dynamics', 'inverse_dynamics', 'image2video', 'joint').
            embodiment_type: Embodiment type for domain ID.
            split: Dataset split. Lower-cased and stripped before validation; accepted values are
                'train', 'val', 'valid', 'validation', 'eval', 'test' and 'full'.
            seed: Random seed for shuffling.
            shuffle: Whether to shuffle tar files during iteration (for training).
            history_len: Desired history length in seconds. If None, uses all available history.
            future_len: Desired future length in seconds. If None, uses all available future.
            rotation_format: Rotation convention for actions (e.g. "rot9d", "rot6d", "euler_xyz").
            pose_convention: Pose format for actions (e.g. "backward_anchored", "backward_framewise").
            route_lat_noise_range: Half-range for uniform lateral (Y) noise on route waypoints.
            route_long_noise_range: Half-range for uniform longitudinal (X) noise on route waypoints.
            route_point_wise_noise: Std-dev of per-waypoint Gaussian noise on route.
            route_dropout: Whether to apply random waypoint dropout on route during training.
            route_dropout_rate: Probability of fully masking the route (used when route_dropout=True).
            route_tail_dropout_rate: Probability of additional tail dropout (used when route_dropout=True).
            include_route_in_prompt: Whether to include route waypoints as text in the prompt.
            use_semantic_route_prompt: When True and include_route_in_prompt is True, replace raw
                coordinate waypoints with a brief semantic description (e.g. "go forward then turn left").
            translation_scale: Scale factor applied to the translation block of the encoded action.
            rotation_scale: Scale factor applied to the rotation block of the encoded action
                (uniform scalar, preserves rotation-block geometry). Pass the same value to
                `pose_rel_to_abs` when decoding.
            max_action_translation_norm: If set, discard the sample when any per-frame
                scaled translation L2 norm exceeds this value. Acts as an outlier
                filter to prevent loss spikes from extreme camera motion.
            align_opencv_pose: If True, transform pose rotations from car body-frame
                convention (x=forward, y=left, z=up) to OpenCV camera convention
                (x=right, y=down, z=forward) before computing relative actions.
            mode_aware_domain: When True, inverse_dynamics/policy modes use a separate domain ID.
            inv_embodiment_type: Embodiment type string for the inverse domain ID.

        Raises:
            ValueError: ``mode`` or ``split`` is not supported, or a root is neither an
                S3/GCS URL nor an existing local path.
        """
        super().__init__()

        # Normalize ``root`` to a list of paths without trailing slashes.
        if isinstance(root, str):
            root = [root]
        self.roots = [r.rstrip("/") for r in root]
        self.credential_path = credential_path
        self.resolution = resolution
        self.fps = fps
        self.mode = mode
        self.split = split.lower().strip()
        self.seed = seed
        self.shuffle = shuffle
        self._epoch = 0
        self.history_len = history_len
        self.future_len = future_len
        self.rotation_format: RotationConvention = rotation_format
        # NOTE(review): this annotation also admits "absolute", which the
        # constructor parameter's Literal does not — confirm which is intended.
        self.pose_convention: Literal["absolute", "backward_anchored", "backward_framewise"] = pose_convention
        self.route_lat_noise_range = route_lat_noise_range
        self.route_long_noise_range = route_long_noise_range
        self.route_point_wise_noise = route_point_wise_noise
        self.route_dropout = route_dropout
        self.route_dropout_rate = route_dropout_rate
        self.route_tail_dropout_rate = route_tail_dropout_rate
        self.include_route_in_prompt = include_route_in_prompt
        self.use_semantic_route_prompt = use_semantic_route_prompt
        self.translation_scale = translation_scale
        self.rotation_scale = rotation_scale
        self.max_action_translation_norm = max_action_translation_norm
        self.align_opencv_pose = align_opencv_pose
        # Get domain ID for this embodiment
        self.domain_id = get_domain_id(embodiment_type)
        self.mode_aware_domain = mode_aware_domain
        # Without mode-aware domains the inverse ID simply mirrors the regular one.
        self.domain_id_inv = get_domain_id(inv_embodiment_type) if mode_aware_domain else self.domain_id

        # Validate mode
        valid_modes = ["joint", "forward_dynamics", "inverse_dynamics", "policy", "image2video"]
        if mode not in valid_modes:
            raise ValueError(f"mode must be one of {valid_modes}, got {mode}")

        # Validate split (checked on the normalized value; the message shows the raw argument)
        if self.split not in {"train", "val", "valid", "validation", "eval", "test", "full"}:
            raise ValueError(f"Unsupported {split=}. Use train/val/full.")

        # Roots may be S3/GCS-compatible paths or local packaged demo roots.
        for r in self.roots:
            if not (r.startswith("s3://") or r.startswith("gs://") or Path(r).exists()):
                raise ValueError(f"root must be an S3/GCS path or existing local path, got: {r}")

        # Configure S3 backend using easy_io
        self._setup_s3_backend()

        # Load tar files from wdinfo.json. These attributes are initialized here and
        # presumably filled in by _load_wdinfo() (its body is outside this view — confirm).
        self._tar_files: list[str] = []
        self._total_key_count: int = 0
        self._chunk_size: int = 10

        self._load_wdinfo()

        log.info(
            f"Initialized AVDataset: root={self.roots}, split={self.split}, "
            f"resolution={resolution}, fps={fps}, mode={mode}, "
            f"num_tar_files={len(self._tar_files)}, "
            f"total_samples={self._total_key_count}"
        )

    def _setup_s3_backend(self) -> None:
        """Configure the easy_io S3 backend with this dataset's credential path.

        Called from ``__init__``. The original note says it is also called from
        ``__iter__`` for worker processes; that call site is not visible here.
        """
        easy_io.set_s3_backend(
            backend_args={
                "backend": "s3",
                "path_mapping": None,
                "s3_credential_path": self.credential_path,
            }
        )
+ """ + self._tar_files = [] + self._total_key_count = 0 + + for root in self.roots: + is_remote = root.startswith(("s3://", "gs://")) + bucket = root.replace("s3://", "").replace("gs://", "").split("/")[0] if is_remote else "" + + # Determine which splits we need + if self.split in {"val", "valid", "validation", "eval", "test"}: + target_splits = ["val"] + elif self.split == "train": + target_splits = ["train"] + elif self.split == "full": + target_splits = ["train", "val"] + else: + raise ValueError(f"Unsupported split: {self.split}") + + # Try split-based layout first ({root}/train/wdinfo.json, {root}/val/wdinfo.json) + wdinfo_entries: list[tuple[str, dict]] = [] + for split_name in target_splits: + split_path = f"{root}/{split_name}/wdinfo.json" + try: + if is_remote: + wdinfo_entries.append((split_path, json.loads(easy_io.get(split_path)))) + else: + path = Path(split_path) + if path.exists(): + wdinfo_entries.append((split_path, json.loads(path.read_text()))) + except Exception: + pass + + # Fall back to flat layout ({root}/wdinfo.json, treated as train-only) + if not wdinfo_entries and "train" in target_splits: + flat_path = f"{root}/wdinfo.json" + try: + if is_remote: + wdinfo_entries.append((flat_path, json.loads(easy_io.get(flat_path)))) + else: + path = Path(flat_path) + if path.exists(): + wdinfo_entries.append((flat_path, json.loads(path.read_text()))) + except Exception: + pass + + if not wdinfo_entries: + log.warning(f"No wdinfo.json found for root={root}, split={self.split}") + + for wdinfo_path, wdinfo in wdinfo_entries: + log.info(f"Loading wdinfo from: {wdinfo_path}") + + # Extract metadata + self._chunk_size = wdinfo.get("chunk_size", 10) + data_root = wdinfo.get("root", "") + data_list = wdinfo.get("data_list", []) + + if not data_list: + log.warning(f"No tar files found in wdinfo: {wdinfo_path}") + continue + + # Reconstruct full tar paths. 
+ if is_remote: + tar_root = f"s3://{bucket}/{data_root}".rstrip("/") + else: + tar_root = str((Path(root) / data_root).resolve()) if data_root else str(Path(root).resolve()) + tar_paths = [f"{tar_root}/{filename}" for filename in data_list] + self._tar_files.extend(tar_paths) + + # Accumulate total sample count + self._total_key_count += wdinfo.get("total_key_count", len(data_list) * self._chunk_size) + + log.info( + f"Loaded {len(data_list)} tar files from wdinfo, " + f"with {wdinfo.get('total_key_count', len(data_list) * self._chunk_size)} samples" + ) + + if not self._tar_files: + raise RuntimeError(f"No tar files found in wdinfo at {self.roots}") + + def __len__(self) -> int: + """Return the estimated number of samples in the current split.""" + return self._total_key_count + + def _process_sample(self, pkl_data: dict, key: str, global_idx: int) -> dict | None: + """Process a single sample from pkl data. + + Args: + pkl_data: Dictionary with 'video' (bytes) and 'action' (pickled dict). + key: Sample key (basename without .pkl). + global_idx: Global sample index for __key__. + + Returns: + Processed sample dictionary, or None if the sample should be discarded. 
+ """ + # Extract video bytes + video_bytes = pkl_data.get("video") + if video_bytes is None: + raise RuntimeError(f"No video found for key {key}") + + # Extract action data + action_bytes = pkl_data.get("action") + if action_bytes is None: + raise RuntimeError(f"Missing action data for key {key}") + + action_data = pickle.loads(action_bytes) + + # Extract route data + route_bytes = pkl_data.get("route") + if route_bytes is not None: + route_data = pickle.loads(route_bytes) + if not isinstance(route_data, torch.Tensor): + route = torch.tensor(route_data, dtype=torch.float32) # [num_waypoints,3] + else: + route = route_data.float() # [num_waypoints,3] + else: + log.warning(f"No route found for key {key}") + route = torch.full((20, 3), float("nan")) # [20,3] + + # Apply route augmentations during training + if self.split == "train": + route = add_route_noise( + route, + lat_noise_range=self.route_lat_noise_range, + long_noise_range=self.route_long_noise_range, + point_wise_noise=self.route_point_wise_noise, + ) + if self.route_dropout: + route = apply_route_dropout( + route, + dropout_rate=self.route_dropout_rate, + tail_dropout_rate=self.route_tail_dropout_rate, + ) + + # Get original history frame count for video slicing + original_history_steps = len(action_data["history_xyz"]) + + # Decode video + video, _ = decode_video_bytes( + video_bytes, + resolution=self.resolution, + history_len=self.history_len, + future_len=self.future_len, + original_history_steps=original_history_steps, + ) + + # Determine mode for this sample + if self.mode == "joint": + mode = random.choice(["forward_dynamics", "inverse_dynamics", "policy"]) + # mode = random.choice(["policy", "image2video"]) + else: + mode = self.mode + + # Process actions + action_result = process_action_trajectory( + action_data, + history_len=self.history_len, + future_len=self.future_len, + fps=self.fps, + rotation_format=self.rotation_format, + pose_convention=self.pose_convention, + 
scale=self.translation_scale, + rotation_scale=self.rotation_scale, + max_translation_norm=self.max_action_translation_norm, + align_opencv_pose=self.align_opencv_pose, + ) + if action_result is None: + return None + history_action, future_action = action_result + + # Combine and pad actions + combined_action = torch.cat([history_action, future_action], dim=0) # [T_hist+T_fut,action_dim] + + # FPS as tensor + fps_tensor = torch.tensor(self.fps, dtype=torch.long) # scalar + + # Key as tensor + key_tensor = torch.tensor([global_idx], dtype=torch.long) # [1] + + # Compute actual history/future lengths from data + actual_history_length = history_action.shape[0] + actual_future_length = future_action.shape[0] + + # Generate prompt based on actual data lengths + history_duration = actual_history_length / self.fps + future_duration = actual_future_length / self.fps + + prompt = "You are an autonomous vehicle planning system. " + if self.include_route_in_prompt and mode == "policy": # only include route in prompt for policy mode + if self.use_semantic_route_prompt: + future_ego = compute_future_trajectory_in_ego_frame( + action_data, self.history_len, self.future_len, self.fps + ) + trajectory_desc = classify_trajectory_to_text(future_ego) + prompt += f"Please {trajectory_desc}. " + else: + num_waypoints = route.shape[0] + waypoints_str = ", ".join( + "nan" if torch.isnan(wp[0]) else f"({wp[0]:.2f}, {wp[1]:.2f}, {wp[2]:.2f})" for wp in route + ) + prompt += ( + f"The navigation route has {num_waypoints} waypoints " + f"(XYZ in ego frame with X=forward, Y=left, Z=up): " + f"[{waypoints_str}]. A nan waypoint means that waypoint is not available. " + ) + # prompt += f"Predict the future {future_duration:.1f}s action trajectory at {self.fps}Hz." 
+ + # Select domain ID: use inverse domain for generation modes when mode_aware_domain is on + if self.mode_aware_domain and mode in ["inverse_dynamics", "policy"]: + domain_id = self.domain_id_inv + else: + domain_id = self.domain_id + + sample = { + "video": video, + "action": combined_action, + "action_history": history_action, + "action_future": future_action, + "route": route, + "conditioning_fps": fps_tensor, + "prompt": prompt, + "ai_caption": prompt, + "mode": mode, + "__key__": key_tensor, + "domain_id": torch.tensor(domain_id, dtype=torch.long), + "history_length": actual_history_length, + "future_length": actual_future_length, + "viewpoint": "ego_view", + } + return sample + + def __iter__(self) -> Iterator[dict[str, torch.Tensor | str | int]]: + """Iterate over the dataset, loading tar files from S3.""" + # Re-configure S3 backend in case this is running in a worker process after unpickling + self._setup_s3_backend() + + # Optionally shuffle tar files for training + tar_files = list(self._tar_files) + if self.shuffle: + rng = random.Random(self.seed + self._epoch) + rng.shuffle(tar_files) + self._epoch += 1 + + global_idx = 0 + + for tar_path in tar_files: + try: + # Read tar file bytes. Local packaged demo paths should not be + # decoded as text through easy_io.get(). 
+ if tar_path.startswith(("s3://", "gs://")): + tar_bytes = easy_io.get_bytes(tar_path) + tar_cm = tarfile.open(fileobj=io.BytesIO(tar_bytes), mode="r:*") + else: + tar_cm = tarfile.open(tar_path, mode="r:*") + + with tar_cm as tar: + for member in tar.getmembers(): + if not member.name.endswith(".pkl"): + continue + + try: + # Extract and process the sample + f_member = tar.extractfile(member) + if f_member is None: + log.warning(f"Failed to extract {member.name} from {tar_path}") + continue + + try: + pkl_data = pickle.load(f_member) + finally: + f_member.close() + + key = member.name.rsplit(".", 1)[0] + + sample = self._process_sample(pkl_data, key, global_idx) + if sample is not None: + yield sample + global_idx += 1 + + except Exception as e: + log.warning(f"Failed to process sample {member.name} from {tar_path}: {e}") + continue + + except Exception as e: + log.warning(f"Failed to read tar file {tar_path}: {e}") + continue + + +# PYTHONPATH=. python projects/cosmos3/vfm/datasets/action/av_dataset.py +if __name__ == "__main__": + import json as _json + import os + import time + + import torchvision + + from cosmos_framework.data.vfm.action.pose_utils import pose_rel_to_abs + + _ACTION_SCALE = 1.35 + _ROTATION_SCALE = 1.0 + _ROTATION_FORMAT = "rot6d" + _POSE_CONVENTION = "backward_framewise" + + dataset = AVDataset( + root=[ + # "s3://nv-00-10206-robot/cosmos3_action_data/av_02182026_wdinfo/", + # "s3://nv-00-10206-robot/cosmos3_action_data/av_03292026_wdinfo/", + "s3://nv-00-10206-robot/cosmos3_action_data/av_v2_02182026_wdinfo/", + "s3://nv-00-10206-robot/cosmos3_action_data/av_v2_03292026_wdinfo/", + ], + split="train", + shuffle=True, + fps=10, + mode="inverse_dynamics", + history_len=0.1, + future_len=6.0, + rotation_format=_ROTATION_FORMAT, + pose_convention=_POSE_CONVENTION, + translation_scale=_ACTION_SCALE, + rotation_scale=_ROTATION_SCALE, + resolution="480", + include_route_in_prompt=False, + use_semantic_route_prompt=False, + # 
align_opencv_pose=False, + ) + dataset_iter = iter(dataset) + os.makedirs("temp", exist_ok=True) + + for i in range(5): + print(f"==================== Sample {i} ====================") + _t0 = time.time() + data = next(dataset_iter) + _t1 = time.time() + print(f"{'Loading time':<25}: {_t1 - _t0:.2f}s") + + print(f"{'video shape':<25}: {data['video'].shape}") # [C,T,H,W] + print(f"{'action shape':<25}: {data['action'].shape}") # [T,action_dim] + print(f"{'action_history shape':<25}: {data['action_history'].shape}") + print(f"{'action_future shape':<25}: {data['action_future'].shape}") + print(f"{'route shape':<25}: {data['route'].shape}") + print(f"{'history_length':<25}: {data['history_length']}") + print(f"{'future_length':<25}: {data['future_length']}") + print(f"{'conditioning_fps':<25}: {data['conditioning_fps'].item()}") + print(f"{'mode':<25}: {data['mode']}") + print(f"{'domain_id':<25}: {data['domain_id'].item()}") + print(f"{'prompt':<25}: {data['prompt']}") + + # save video + video = data["video"].permute(1, 0, 2, 3) # [C,T,H,W] -> [T,C,H,W] + video_path = f"temp/av_sample_{i}.mp4" + torchvision.io.write_video( + video_path, video.permute(0, 2, 3, 1).numpy(), fps=data["conditioning_fps"].item() + ) # expects (T, H, W, C) + print(f"Saved video to {video_path}") + + # reconstruct absolute poses from relative actions and save as json + camera_poses = pose_rel_to_abs( + data["action"].float().numpy(), + rotation_format=_ROTATION_FORMAT, + pose_convention=_POSE_CONVENTION, + translation_scale=_ACTION_SCALE, + rotation_scale=_ROTATION_SCALE, + ) + pose_path = f"temp/av_sample_{i}_camera.json" + with open(pose_path, "w") as f: + _json.dump(camera_poses.tolist(), f) + print(f"Saved camera poses to {pose_path}") diff --git a/cosmos-framework/cosmos_framework/data/vfm/action/bridge_orig_lerobot_dataset.py b/cosmos-framework/cosmos_framework/data/vfm/action/bridge_orig_lerobot_dataset.py new file mode 100644 index 
0000000000000000000000000000000000000000..4cc0a30899cfbc3b56894a404dc82a1cfa0cf1aa --- /dev/null +++ b/cosmos-framework/cosmos_framework/data/vfm/action/bridge_orig_lerobot_dataset.py @@ -0,0 +1,272 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# * https://github.com/2toinf/X-VLA/blob/30090f81cf91b15da73af234ce2b098fe20590f8/datasets/domain_handler/simulations.py#L70-L93 +# * https://github.com/2toinf/X-VLA/issues/11 +# * https://github.com/2toinf/X-VLA/issues/33 +# * https://github.com/2toinf/X-VLA/issues/67 +# + +# uses identity stats (q01=-1, q99=1) on the 6D rotation dims 3..8, while +# ``"quantile_rot"`` uses the raw stats and normalizes those columns too. + +from typing import Any + +import numpy as np +import torch +from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata + +from cosmos_framework.utils import log +from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import ( + ActionNormalization, + ActionSpec, + BaseActionLeRobotDataset, + Gripper, + Pos, + Rot, + build_action_spec, +) +from cosmos_framework.data.vfm.action.pose_utils import ( + PoseConvention, + build_abs_pose_from_components, + convert_rotation, + pose_abs_to_rel, +) +from cosmos_framework.data.vfm.action.viewpoint_utils import Viewpoint + +# Bridge rotation decomposition: +# 1) _DEFAULT_ROTATION: raw bridge state → kinematics (MJCF/URDF) frame. +# The WidowX controller records ``R_state = R_fk @ DEFAULT_ROTATION.T``, +# so ``R_fk = R_state @ DEFAULT_ROTATION``. +# 2) _TCP_TO_FLANGE: re-reference from ee_gripper_link to gripper_link +# (pure translation in kinematics frame). See block below. +# 3) _KIN_TO_OPENCV: kinematics → OpenCV convention (for training/vis). +# The viewer undoes this before IK to recover the kinematic frame. 
# Step 1 of the decomposition: right-multiplied onto the raw state rotation to
# obtain the kinematics-frame rotation (``R_fk = R_state @ _DEFAULT_ROTATION``).
_DEFAULT_ROTATION = np.array(
    [[0, 0, 1], [0, 1, 0], [-1, 0, 0]],
    dtype=np.float32,
)
# Step 3 of the decomposition: rotation-only change to the OpenCV convention.
# ``BridgeOrigLeRobotDataset`` stores it as ``self._to_opencv`` and right-multiplies
# it onto the rotation block after the TCP → flange shift.
_BRIDGE_TO_OPENCV = np.array(
    [[0, 0, 1], [-1, 0, 0], [0, -1, 0]],
    dtype=np.float32,
)

# ---------------------------------------------------------------------------
# TCP → flange (gripper body) offset
# ---------------------------------------------------------------------------
# The bridge dataset records EE poses at ``ee_gripper_link`` — the Interbotix
# SDK's end-effector reference, 93.6 mm past the wrist rotate body
# (``gripper_link``), roughly at the grasp center between the finger pads.
# For action learning we re-reference poses to the *wrist rotate body*
# (``gripper_link``) because:
#   1. It is the last actuated link — its pose is fully determined by joint
#      angles, with no dependence on finger opening.
#   2. The ~10 cm offset reduces the lever-arm effect of small rotation
#      errors on position accuracy.
#   3. Consistent with Google Robot, where we also target the gripper body.
#
# The constant below is the SE(3) transform from ``ee_gripper_link`` to
# ``gripper_link``, computed from the SimplerEnv URDF via pinocchio FK at the
# neutral configuration:
#     T = oMf[ee_gripper_link]⁻¹ · oMf[gripper_link]
# It is pure translation (identity rotation) — the two frames share the
# same orientation by construction (connected via fixed joints with no
# rotational offset).
#
# Source URDF: https://github.com/simpler-env/ManiSkill2_real2sim
#   → mani_skill2_real2sim/assets/descriptions/widowx_description/
#
# NOTE(review): the sentence leading into the next line appears to be truncated
# in the source; the surviving fragment is kept verbatim.

# so the translation is expressed in the kinematic (MJCF) frame.
class BridgeOrigLeRobotDataset(BaseActionLeRobotDataset):
    """LeRobot adapter for the original Bridge dataset that emits 10D actions.

    Every action row is ``[Pos(3), Rot6d(6), Gripper(1)]``. The end-effector
    pose is rebuilt from ``observation.state`` (xyz in columns 0..2, XYZ Euler
    angles in columns 3..5), moved through the raw → kinematics → flange →
    OpenCV chain, and then expressed either as absolute poses or as
    ``backward_framewise`` deltas. The gripper channel is column 6 of the
    recorded ``action``.
    """

    def __init__(
        self,
        root: str = "",
        fps: float = 5.0,
        chunk_length: int = 16,
        split_seed: int = 42,
        split_val_ratio: float = 0.05,
        split: str = "train",
        mode: str = "policy",
        pose_convention: PoseConvention = "backward_framewise",
        action_normalization: ActionNormalization | None = None,
        viewpoint: Viewpoint = "ego_view",
        enable_fast_init: bool = False,
    ) -> None:
        """Configure the dataset and the per-key frame offsets it queries.

        Args:
            root: Dataset root; registered as the single shard root.
            fps: Forwarded to ``BaseActionLeRobotDataset``.
            chunk_length: Number of action steps per sample. State and image
                are queried at ``chunk_length + 1`` timestamps, action at
                ``chunk_length``.
            split_seed: Forwarded to ``BaseActionLeRobotDataset``.
            split_val_ratio: Forwarded to ``BaseActionLeRobotDataset``.
            split: Forwarded to ``BaseActionLeRobotDataset``.
            mode: Forwarded to ``BaseActionLeRobotDataset``.
            pose_convention: ``"absolute"`` or ``"backward_framewise"``; any
                other value makes ``__getitem__`` raise ``ValueError``.
            action_normalization: Forwarded to ``BaseActionLeRobotDataset``.
            viewpoint: Forwarded to ``BaseActionLeRobotDataset``.
            enable_fast_init: Forwarded to ``BaseActionLeRobotDataset``.
        """
        super().__init__(
            fps=fps,
            chunk_length=chunk_length,
            split_seed=split_seed,
            split_val_ratio=split_val_ratio,
            split=split,
            mode=mode,
            embodiment_type="bridge_orig_lerobot",
            viewpoint=viewpoint,
            pose_convention=pose_convention,
            rotation_format="rot6d",
            action_normalization=action_normalization,
            tolerance_s=1e-4,
            enable_fast_init=enable_fast_init,
        )
        # _to_opencv is the kinematics→OpenCV part only.
        # The viewer undoes this before IK → recovers kinematic frame directly.
        self._to_opencv = _BRIDGE_TO_OPENCV

        self._all_shard_roots = [root]

        self._delta_timestamps = {
            "observation.images.image_0": [i * self._dt for i in range(0, self._chunk_length + 1)],
            "observation.state": [i * self._dt for i in range(0, self._chunk_length + 1)],
            "action": [i * self._dt for i in range(0, self._chunk_length)],
        }

    # ------------------------------------------------------------------
    # Action computation
    # ------------------------------------------------------------------

    def _bridge_state_to_opencv_poses(self, states: Any) -> np.ndarray:
        """Turn state rows into absolute 4×4 poses in the corrected OpenCV frame.

        Shared by both pose conventions so the frame chain is defined once.
        """
        poses_abs = build_abs_pose_from_components(
            states[:, 0:3],
            states[:, 3:6],
            "euler_xyz",
        )

        # 1. Raw → kinematics: apply DEFAULT_ROTATION
        # 2. TCP → flange: shift from ee_gripper_link to gripper_link
        # 3. Kinematics → OpenCV convention (rotation only)
        poses_abs[:, :3, :3] = poses_abs[:, :3, :3] @ _DEFAULT_ROTATION.astype(poses_abs.dtype)
        poses_abs = poses_abs @ _TCP_TO_FLANGE.astype(poses_abs.dtype)
        poses_abs[:, :3, :3] = poses_abs[:, :3, :3] @ self._to_opencv.astype(poses_abs.dtype)
        return poses_abs

    def _compute_absolute_action(self, sample: dict[str, Any]) -> tuple[torch.Tensor, torch.Tensor]:
        """Absolute action from state + gripper from action.

        EEF xyz+rotation come from observation.state[1:]; gripper from action[:, 6].

        Returns:
            (action_tensor, initial_pose) — initial_pose is the first-frame
            absolute EE pose (4×4, in the corrected OpenCV frame).
        """
        # Convert every state row, then split: row 0 is the first frame
        # (initial pose), rows 1.. are the absolute targets of each action step.
        # Previously initial_pose was taken from row 1, which disagreed with the
        # documented contract and with the backward_framewise path.
        # NOTE(review): assumes build_abs_pose_from_components treats rows
        # independently, as the framewise path already relies on — confirm.
        poses_all = self._bridge_state_to_opencv_poses(sample["observation.state"])
        initial_pose = torch.from_numpy(poses_all[0].copy()).float()
        poses_abs = poses_all[1:]  # [T, 4, 4]

        translation = torch.from_numpy(poses_abs[:, :3, 3]).float()
        rotation_matrix = torch.from_numpy(poses_abs[:, :3, :3]).float()
        rotation = convert_rotation(rotation_matrix, input_format="matrix", output_format="rot6d").float()

        pose = torch.cat([translation, rotation], dim=-1)  # [T, 9]
        return torch.cat([pose, sample["action"][:, [6]]], dim=-1), initial_pose  # [T, 10]

    def _compute_backward_framewise_action(self, sample: dict[str, Any]) -> tuple[torch.Tensor, torch.Tensor]:
        """Body-frame (ego-frame) delta: ``T_curr^{-1} @ T_next``.

        Matches Camera/AV ``backward_framewise`` convention. Translation is in
        the current frame's local coordinate system; rotation is
        ``R_curr^{-1} @ R_next``.

        Returns:
            (action_tensor, initial_pose) — initial_pose is the first-frame
            absolute EE pose (4×4, in the corrected OpenCV frame).
        """
        states = sample["observation.state"]  # (chunk_length + 1, 8)
        poses_abs = self._bridge_state_to_opencv_poses(states)

        initial_pose = torch.from_numpy(poses_abs[0].copy()).float()

        poses_rel = pose_abs_to_rel(
            poses_abs=poses_abs,
            rotation_format="rot6d",
            pose_convention="backward_framewise",
        )
        poses_rel_tensor = torch.from_numpy(poses_rel).float()

        return torch.cat([poses_rel_tensor, sample["action"][:, [6]]], dim=-1), initial_pose

    # ------------------------------------------------------------------
    # Normalization is handled by BaseActionLeRobotDataset.
    # Stats are loaded from:
    #   cosmos_framework/data/vfm/action/normalizers/
    #       bridge_orig_lerobot__.json
    # Regenerate via ``compute_action_stats.py`` + ``debug/stats_all.sh``.
    # ------------------------------------------------------------------

    # ------------------------------------------------------------------
    # Episode filtering
    # ------------------------------------------------------------------
    def _filter_valid_episodes(self, meta: LeRobotDatasetMetadata, episode_ids: list[int]) -> list[int]:
        """Drop episodes whose ``tasks`` metadata is empty/whitespace.

        Narrower than the offline
        ``projects/cosmos3/vfm/datasets/action/filter_bridge_dataset.py``
        (which also flags gibberish/question/non-English/patterns via
        ``classify_task``).

        Returns:
            The ids from ``episode_ids``, in their original order, that have
            at least one non-blank task string.
        """
        kept: list[int] = []
        dropped = 0
        for ep_id in episode_ids:
            ep = meta.episodes[ep_id]
            tasks = ep.get("tasks", [])
            # A bare string is treated as a single task rather than iterated per character.
            if isinstance(tasks, str):
                tasks = [tasks]
            has_prompt = any(t and str(t).strip() for t in (tasks or []))
            if has_prompt:
                kept.append(ep_id)
            else:
                dropped += 1
        if dropped:
            log.info(f"BridgeOrigLeRobotDataset: dropped {dropped} / {len(episode_ids)} episodes with empty prompt")
        return kept

    # ------------------------------------------------------------------
    # __getitem__
    # ------------------------------------------------------------------

    def _build_action_spec(self) -> ActionSpec:
        """Bridge: 10D = ``[Pos, Rot6d, Gripper]``."""
        return build_action_spec(Pos(), Rot("rot6d"), Gripper())

    def __getitem__(self, idx: int) -> dict[str, Any]:
        """Fetch sample ``idx`` and package video, action, caption and initial pose.

        The caption is the sample's ``task`` string and the video is
        ``observation.images.image_0``.

        Raises:
            ValueError: If the configured pose convention is neither
                ``"absolute"`` nor ``"backward_framewise"``.
        """
        mode, _, _, sample = self._fetch_sample(idx)

        ai_caption = sample["task"]

        video = sample["observation.images.image_0"]  # [T,C,H,W]
        if self._pose_convention == "absolute":
            action, initial_pose = self._compute_absolute_action(sample)
        elif self._pose_convention == "backward_framewise":
            action, initial_pose = self._compute_backward_framewise_action(sample)
        else:
            raise ValueError(f"Unknown pose_convention: {self._pose_convention}")

        return self._build_result(
            mode=mode, video=video, action=action, ai_caption=ai_caption, initial_pose=initial_pose
        )

    @property
    def action_dim(self) -> int:
        """Action dimensionality: position(3) + 6D rotation(6) + gripper(1) = 10."""
        return 10
# Canvas sizes per resolution bucket, keyed by "W,H" aspect ratio; values are (width, height).
VIDEO_RES_SIZE_INFO: dict[str, dict[str, tuple[int, int]]] = {
    "256": {"1,1": (256, 256), "4,3": (320, 256), "3,4": (256, 320), "16,9": (320, 192), "9,16": (192, 320)},
    "480": {"1,1": (640, 640), "4,3": (736, 544), "3,4": (544, 736), "16,9": (832, 480), "9,16": (480, 832)},
}


def get_target_size_and_crop(resolution: str, current_H: int, current_W: int) -> tuple[int, int, int, int]:
    """Pick the canvas closest in aspect ratio and the resize that covers it.

    Args:
        resolution: Key into ``VIDEO_RES_SIZE_INFO`` (e.g. ``"256"``, ``"480"``).
        current_H: Source height in pixels.
        current_W: Source width in pixels.

    Returns:
        ``(resized_H, resized_W, canvas_H, canvas_W)``. The resize uses the
        larger of the two per-axis scale factors, and the resized dimensions
        are rounded to the nearest integer.

    Raises:
        KeyError: If ``resolution`` is not a known bucket.
    """
    canvases = VIDEO_RES_SIZE_INFO[resolution]
    source_ratio = current_W / current_H

    def _ratio_gap(ar_key: str) -> float:
        # Keys look like "16,9": width part first, height part second.
        parts = ar_key.split(",")
        return abs((int(parts[0]) / int(parts[1])) - source_ratio)

    closest_key = min(canvases, key=_ratio_gap)
    canvas_W, canvas_H = canvases[closest_key]

    # Take the larger factor so neither resized axis falls short of the canvas.
    scale = max(canvas_W / current_W, canvas_H / current_H)
    resized_H = int(scale * current_H + 0.5)
    resized_W = int(scale * current_W + 0.5)
    return resized_H, resized_W, canvas_H, canvas_W
def _ensure_hf_hub_offline() -> None:
    """Switch the Hugging Face Hub client to offline mode, once per process.

    Three things happen on the first call: ``HF_HUB_OFFLINE`` is exported
    (unless the environment already defines it), the already-imported
    ``huggingface_hub.constants.HF_HUB_OFFLINE`` flag is forced to ``True``,
    and the ``huggingface_hub._snapshot_download`` logger is raised to
    ``ERROR``. Later calls return immediately.
    """
    global _hf_offline_applied
    if not _hf_offline_applied:
        # Respect an explicit user setting; only fill in the variable when absent.
        _os.environ.setdefault("HF_HUB_OFFLINE", "1")
        # The constant was read at import time, so the env var alone is not enough.
        if not _hf_const.HF_HUB_OFFLINE:
            _hf_const.HF_HUB_OFFLINE = True
        snapshot_logger = _logging.getLogger("huggingface_hub._snapshot_download")
        snapshot_logger.setLevel(_logging.ERROR)
        _hf_offline_applied = True
class _LRUVideoDecoderCache:
    """Drop-in replacement for ``lerobot.datasets.video_utils.VideoDecoderCache``
    with LRU eviction. When the cache exceeds *max_size* entries the
    least-recently-used decoder (and its file handle) is evicted.

    Entries map a video path to ``(decoder, file_handle)``; ``file_handle`` is
    ``None`` for paths that are handed to the decoder directly. All access to
    the cache goes through an internal lock.
    """

    def __init__(self, max_size: int = _LRU_VIDEO_CACHE_MAX_SIZE) -> None:
        """Create an empty cache holding at most ``max_size`` decoders."""
        self._max_size = max_size
        self._cache: OrderedDict[str, tuple[Any, Any]] = OrderedDict()
        self._lock = Lock()
        # Counters used only for the periodic debug log.
        self._hits = 0
        self._misses = 0
        self._evictions = 0

    def get_decoder(self, video_path: str) -> Any:
        """Return a cached decoder for ``video_path``, creating it on a miss.

        Raises:
            ImportError: If ``torchcodec`` is not installed.
        """
        # Import the submodule explicitly: a bare ``import importlib`` does not
        # guarantee that ``importlib.util`` has been loaded.
        import importlib.util

        if importlib.util.find_spec("torchcodec"):
            from torchcodec.decoders import VideoDecoder
        else:
            raise ImportError("torchcodec is required but not available.")

        import fsspec

        video_path = str(video_path)

        with self._lock:
            if video_path in self._cache:
                # Mark as most recently used.
                self._cache.move_to_end(video_path)
                self._hits += 1
                return self._cache[video_path][0]

            self._misses += 1
            # TorchCodec 0.2 accepts local filesystem paths directly, but not
            # fsspec LocalFileOpener handles. Keep the older fsspec path only for
            # non-local URLs/filesystems.
            if video_path.startswith(("http://", "https://", "s3://", "gs://", "hf://")):
                file_handle = fsspec.open(video_path).__enter__()
                try:
                    decoder = VideoDecoder(file_handle, seek_mode="approximate")  # type: ignore[arg-type]
                except Exception:
                    # Do not leak the remote handle when the decoder cannot be built.
                    file_handle.close()
                    raise
            else:
                file_handle = None
                decoder = VideoDecoder(video_path, seek_mode="approximate")
            self._cache[video_path] = (decoder, file_handle)

            evicted = 0
            while len(self._cache) > self._max_size:
                # popitem(last=False) removes the least recently used entry.
                _, (_, old_fh) = self._cache.popitem(last=False)
                if old_fh is not None:
                    try:
                        old_fh.close()
                    except Exception:
                        pass
                evicted += 1
            self._evictions += evicted

            if evicted and self._evictions % 50 <= evicted:
                log.debug(
                    f"[VideoDecoderCache pid={_os.getpid()}] "
                    f"evicted={self._evictions} total, size={len(self._cache)}/{self._max_size}, "
                    f"hits={self._hits}, misses={self._misses}, "
                    f"hit_rate={100 * self._hits / max(1, self._hits + self._misses):.1f}%"
                )

            return decoder

    def clear(self) -> None:
        """Close every open file handle and drop all cached decoders."""
        with self._lock:
            for _, file_handle in self._cache.values():
                # Local-path entries carry no handle; skip them explicitly rather
                # than relying on the except clause to hide an AttributeError.
                if file_handle is None:
                    continue
                try:
                    file_handle.close()
                except Exception:
                    pass
            self._cache.clear()

    def size(self) -> int:
        """Return the number of decoders currently cached."""
        with self._lock:
            return len(self._cache)
def split_episode_ids(total_episodes: int, seed: int, val_ratio: float, split: str) -> list[int]:
    """Create deterministic random episode ids for train/val/full splits.

    A seeded permutation of ``range(total_episodes)`` is drawn; the first
    ``round(total_episodes * val_ratio)`` ids form the validation split and the
    remainder the training split, so ``val + train`` always equals the full
    permutation for a given seed.

    Args:
        total_episodes: Number of episodes in the dataset.
        seed: Seed for the permutation.
        val_ratio: Fraction of episodes assigned to validation. Values outside
            ``[0, 1]`` are clamped.
        split: ``"train"`` or ``"val"``; any other value returns the full
            permutation.

    Returns:
        The episode ids of the requested split, in permutation order.
    """
    # Clamp the validation count: a negative count would otherwise be used as a
    # negative slice bound and silently select ids from the wrong end.
    num_val = min(max(int(round(total_episodes * val_ratio)), 0), total_episodes)
    generator = torch.Generator().manual_seed(seed)
    episode_ids = torch.randperm(total_episodes, generator=generator).tolist()

    if split == "train":
        return episode_ids[num_val:]
    if split == "val":
        return episode_ids[:num_val]
    return episode_ids
+ + Returns: + - episode spans as ``(episode_id, sample_start, valid_len)`` + - total valid sample count across selected episodes + - total raw frame count across selected episodes + """ + assert sample_stride >= 1, f"sample_stride must be >= 1, got {sample_stride}" + + dataset_from_index = list(episodes["dataset_from_index"]) + dataset_to_index = list(episodes["dataset_to_index"]) + length = list(episodes["length"]) + + spans: list[tuple[int, int, int]] = [] + valid_count = 0 + sample_count = 0 + for episode_id in episode_ids: + start = dataset_from_index[episode_id] + stop = dataset_to_index[episode_id] + raw_valid_len = stop - start - chunk_length + if raw_valid_len > 0: + valid_len = (raw_valid_len + sample_stride - 1) // sample_stride + spans.append((episode_id, start, valid_len)) + valid_count += valid_len + sample_count += int(length[episode_id]) + + return spans, valid_count, sample_count + + +def _normalize_split(split: str) -> str: + """Normalize split name to one of ``'train'``, ``'val'``, ``'full'``.""" + s = split.lower().strip() + if s in {"val", "valid", "validation", "eval", "test"}: + return "val" + if s in {"train", "full"}: + return s + raise ValueError(f"Unsupported {split=}. Use train/val/full.") + + +class BaseActionLeRobotDataset(Dataset): + """Reusable base class for Action LeRobot-backed map-style datasets. + + Subclasses typically: + 1) call ``_register_source`` to register one or more LeRobot sources + 2) implement ``__getitem__`` for dataset-specific sample parsing + 3) call ``_build_result`` to assemble the return dict + """ + + # Applied as: R_opencv = R_native @ _to_opencv + # Subclasses override in __init__; default is identity (no correction). + + # Bundled normalization stats directory. Stats are committed at + # ``<_NORMALIZERS_DIR>/__.json`` (flat + # layout matching the existing UMI files) and produced by + # ``projects/cosmos3/vfm/datasets/action/compute_action_stats.py``. 
+ # Subclasses that need a different filename scheme can override + # :meth:`_normalizer_filename`. + _NORMALIZERS_DIR: ClassVar[Path] = Path(__file__).parent / "normalizers" + + def __init__( + self, + *, + fps: float, + chunk_length: int, + split_seed: int, + split_val_ratio: float, + split: str, + mode: str, + embodiment_type: str, + viewpoint: Viewpoint, + pose_convention: str | None = None, + rotation_format: str | None = None, + action_normalization: ActionNormalization | None = None, + tolerance_s: float = 1e-4, + max_loaded_datasets: int = _LRU_DATASET_MAX_LOADED, + skip_video_loading: bool = False, + sample_stride: int = 1, + enable_fast_init: bool = False, + fast_init_max_workers: int = 64, + ) -> None: + super().__init__() + _ensure_hf_hub_offline() + _patch_decoder_cache() + self._memprofile = _memprofile_enabled() + + assert sample_stride >= 1, f"sample_stride must be >= 1, got {sample_stride}" + assert fast_init_max_workers >= 1, f"fast_init_max_workers must be >= 1, got {fast_init_max_workers}" + assert action_normalization is None or action_normalization in _ACTION_NORMALIZATION_CHOICES, ( + f"action_normalization must be None or one of {_ACTION_NORMALIZATION_CHOICES}, got {action_normalization!r}" + ) + + with rss_tracker(f"{self.__class__.__name__}.__init__", enabled=self._memprofile): + self._fps = fps + self._dt = 1.0 / fps + self._chunk_length = chunk_length + self._split_seed = split_seed + self._split_val_ratio = split_val_ratio + self._split = _normalize_split(split) + self._mode = mode + self._embodiment_type = embodiment_type + self._viewpoint: Viewpoint = viewpoint + self._pose_convention = pose_convention + self._rotation_format = rotation_format + self._action_normalization = action_normalization + # Lazy-loaded stats cache, populated on first call to + # :meth:`_normalize_action`. Per-process (workers get their own). 
+ self._norm_stats: dict[str, torch.Tensor] | None = None + self._tolerance_s = tolerance_s + self._max_loaded_datasets = max_loaded_datasets + self._skip_video_loading = skip_video_loading + self._sample_stride = sample_stride + self._enable_fast_init = enable_fast_init + self._fast_init_max_workers = fast_init_max_workers + self._delta_timestamps: dict[str, list[float]] = {} + self._to_opencv: np.ndarray | dict[str, np.ndarray] = np.eye(3, dtype=np.float32) + + if pose_convention is None: + log.warning( + f"{self.__class__.__name__}: pose_convention is not set. " + "Consider specifying 'backward_framewise' or 'backward_anchored'." + ) + + self._datasets: list[LeRobotDataset | None] = [] + self._dataset_build_args: list[dict[str, Any] | None] = [] + self._loaded_lru: OrderedDict[int, None] = OrderedDict() + + # -- Flat index structures (populated by _append_index_records) -- + # Together these two lists form a searchable map from a flat + # global index to (dataset, row, episode, frame). One entry per + # episode span across *all* registered sources. + # + # _episode_records[i] = (ds_idx, sample_start, valid_len, episode_id) + # ds_idx – which source dataset (index into _datasets) + # sample_start – first row of this span in that dataset's table + # valid_len – number of usable frames in this span + # episode_id – the episode this span belongs to + # + # _episode_cum_ends[i] = running total of valid_len through span i + # Used for O(log N) lookup via bisect_right in _resolve_index. + self._episode_records: list[tuple[int, int, int, int]] = [] + self._episode_cum_ends: list[int] = [] + self._num_valid_indices = 0 + self._domain_id = get_domain_id(self._embodiment_type) + + # Deferred-init shard roots — a list of root paths. + # Subclasses populate this in __init__; _register_sources() + # reads _delta_timestamps and _tolerance_s from self (both + # initialised above, with _delta_timestamps overridden by + # each subclass). 
+ # ActionUnifiedIterableDataset.assign_worker uses len() for + # round-robin shard distribution and _register_sources(indices) + # for deferred loading. When empty, shard distribution is + # skipped (every worker iterates the full dataset). + self._all_shard_roots: list[str] = [] + + # -- public properties --------------------------------------------------- + + @property + def fps(self) -> float: + return self._fps + + @property + def chunk_length(self) -> int: + return self._chunk_length + + @property + def split(self) -> str: + return self._split + + @property + def mode(self) -> str: + return self._mode + + @mode.setter + def mode(self, value: str) -> None: + self._mode = value + + @property + def domain_id(self) -> int: + return self._domain_id + + # -- source registration ------------------------------------------------- + + def _register_source( + self, + *, + delta_timestamps: dict[str, list[float]], + tolerance_s: float, + root: str | None = None, + repo_id: str = "local", + force_cache_sync: bool = False, + download_videos: bool = False, + video_backend: str | None = None, + revision: str | None = None, + dataset_label: str | None = None, + prefetched_meta: LeRobotDatasetMetadata | None = None, + ) -> LeRobotDatasetMetadata: + """Register a LeRobot dataset source lazily (metadata-only at init). + + ``prefetched_meta`` lets subclasses load metadata in a thread pool + (``LeRobotDatasetMetadata`` reads are pure I/O — ``info.json`` + + ``episodes.parquet`` + ``tasks.parquet``) and then hand the ready + object to the serial append-path below, which still manages the + order-sensitive shared state (``_datasets`` / ``_dataset_build_args`` + / ``_episode_records`` / ``_episode_cum_ends``). When ``None`` the + caller gets the original single-threaded behavior. 
+ """ + label_str = f" [{dataset_label}]" if dataset_label else "" + cls = self.__class__.__name__ + # "local" is not a valid PEP 440 version, so LeRobot's + # is_valid_version() check skips the get_safe_version() HF API call. + if repo_id == "local" and revision is None: + revision = "local" + + with rss_tracker(f"{cls}{label_str} — metadata load", enabled=self._memprofile): + if prefetched_meta is not None: + meta = prefetched_meta + else: + meta = LeRobotDatasetMetadata( + repo_id=repo_id, + root=root, + revision=revision, + force_cache_sync=force_cache_sync, + ) + ds_idx = len(self._datasets) + self._datasets.append(None) + self._dataset_build_args.append( + { + "repo_id": repo_id, + "root": root, + "delta_timestamps": delta_timestamps, + "tolerance_s": tolerance_s, + "force_cache_sync": force_cache_sync, + "download_videos": download_videos, + "video_backend": video_backend, + "revision": revision, + } + ) + + with rss_tracker( + f"{cls}{label_str} — index records", + enabled=self._memprofile, + extras_fn=lambda: [ + f"episode_records so far: {len(self._episode_records)} entries, " + f"~{_fmt_mb(_deep_size(self._episode_records) / (1024 * 1024))}", + f"episode_cum_ends so far: {len(self._episode_cum_ends)} entries, " + f"~{_fmt_mb(_deep_size(self._episode_cum_ends) / (1024 * 1024))}", + ], + ): + self._append_index_records(meta=meta, ds_idx=ds_idx, dataset_label=dataset_label) + + return meta + + def _append_index_records( + self, + *, + meta: LeRobotDatasetMetadata, + ds_idx: int, + dataset_label: str | None = None, + ) -> None: + """Populate episode split / index records from dataset metadata.""" + episode_ids = split_episode_ids( + total_episodes=meta.total_episodes, + seed=self._split_seed, + val_ratio=self._split_val_ratio, + split=self._split, + ) + + if hasattr(self, "_filter_valid_episodes"): + episode_ids = self._filter_valid_episodes(meta, episode_ids) + episode_spans, valid_count, sample_count = build_episode_spans( + episodes=meta.episodes, + 
episode_ids=episode_ids, + chunk_length=self._chunk_length, + sample_stride=self._sample_stride, + ) + + class_name = self.__class__.__name__ + label = f" [{dataset_label}]" if dataset_label else "" + log.info(f"{class_name}{label}: split={self._split}, num episodes={len(episode_ids)}") + if sample_count > 0: + log.info( + f"{class_name}{label}: kept {valid_count} / {sample_count} " + f"({100 * valid_count / sample_count:.2f} %) samples" + ) + + for episode_id, sample_start, valid_len in episode_spans: + self._episode_records.append((ds_idx, sample_start, valid_len, episode_id)) + self._num_valid_indices += valid_len + self._episode_cum_ends.append(self._num_valid_indices) + + # -- deferred shard registration ----------------------------------------- + + def _register_sources(self, indices: list[int] | None = None) -> None: + """Register a subset (or all) of the shard roots in ``_all_shard_roots``. + + Called by ``ActionUnifiedIterableDataset.assign_worker`` during training, + or explicitly by eval/visualization scripts after construction. + + ``_all_shard_roots`` is a list of root paths. Per-shard args that are + shared across all shards (``delta_timestamps``, ``tolerance_s``) are + taken from ``self``. Subclasses may override this for extra per-shard + setup (e.g. loading instruction segments). + + When ``enable_fast_init=True``, ``LeRobotDatasetMetadata`` (a pure-IO + read of ``info.json`` + ``episodes.parquet`` + ``tasks.parquet``) is + prefetched in a thread pool and handed to the order-sensitive + serial register loop via ``prefetched_meta=``. Shard count scales + the speedup; for single-shard datasets the two paths are + equivalent. + + Args: + indices: Which entries of ``_all_shard_roots`` to register. + ``None`` means all. 
+ """ + if indices is None: + indices = list(range(len(self._all_shard_roots))) + if not indices: + return + + roots = [self._all_shard_roots[i] for i in indices] + + if self._enable_fast_init: + # ``_ensure_hf_hub_offline`` already ran in ``__init__`` and is + # idempotent; no need to re-invoke here. + workers = max(1, min(self._fast_init_max_workers, len(roots))) + metas: list[LeRobotDatasetMetadata | None] = _parallel_map( + lambda root: LeRobotDatasetMetadata(repo_id="local", root=root, revision="local"), + roots, + max_workers=workers, + label=f"{type(self).__name__}: LeRobotDatasetMetadata prefetch", + ) + else: + metas = [None] * len(roots) + + for root, meta in zip(roots, metas): + label = root.rsplit("/", 1)[-1] if "/" in root else root + self._register_source( + root=root, + delta_timestamps=self._delta_timestamps, + tolerance_s=self._tolerance_s, + dataset_label=label, + prefetched_meta=meta, + ) + + # -- lazy dataset access ------------------------------------------------- + + def _get_dataset(self, ds_idx: int) -> LeRobotDataset: + """Get or lazily construct the LeRobot dataset for the given source index. + + Loaded datasets are tracked with LRU ordering. When the number of + loaded datasets exceeds ``_max_loaded_datasets`` the least-recently-used + dataset is evicted (set back to ``None``) so the GC can reclaim it. + """ + ds = self._datasets[ds_idx] + if ds is not None: + self._loaded_lru.move_to_end(ds_idx) + return ds + + _ensure_hf_hub_offline() + + build_args = self._dataset_build_args[ds_idx] + if build_args is None: + raise RuntimeError(f"Missing dataset build args for dataset index {ds_idx}") + + # Evict least-recently-used datasets before loading a new one. 
+ while len(self._loaded_lru) >= self._max_loaded_datasets: + evict_idx, _ = self._loaded_lru.popitem(last=False) + self._datasets[evict_idx] = None + + with rss_tracker( + f"[WORKER {_os.getpid()}] Lazy-loaded ds[{ds_idx}]", + enabled=self._memprofile, + extras_fn=lambda: [f"total loaded={len(self._loaded_lru)}/{len(self._datasets)}"], + ): + delta_ts = build_args["delta_timestamps"] + if self._skip_video_loading: + # Covers both LeRobot v2 (``observation.images.``) and + # v3 (``observation.image.``) video-column conventions. + delta_ts = {k: v for k, v in delta_ts.items() if not k.startswith("observation.image")} + + log.info(f"Loading shard root={build_args['root']}") + ds = LeRobotDataset( + repo_id=build_args["repo_id"], + root=build_args["root"], + delta_timestamps=delta_ts, + tolerance_s=build_args["tolerance_s"], + force_cache_sync=build_args["force_cache_sync"], + download_videos=build_args["download_videos"], + video_backend=build_args["video_backend"], + revision=build_args["revision"], + episodes=None, + ) + if self._skip_video_loading: + ds.meta.info["features"] = { + k: v for k, v in ds.meta.info["features"].items() if v.get("dtype") != "video" + } + self._datasets[ds_idx] = ds + self._loaded_lru[ds_idx] = None + + return ds + + # -- index resolution ---------------------------------------------------- + + def _resolve_index(self, idx: int) -> tuple[int, int, int, int]: + """Map a flat global index to the source dataset, row, episode, and frame. + + Multiple datasets are concatenated into a single virtual sequence. + Each episode contributes a contiguous *span* of valid frames, and + ``_episode_cum_ends[i]`` stores the running total of valid frames + through the *i*-th span. For example, with two episodes of lengths + 5 and 3 the cum-ends are ``[5, 8]``, so global index 6 falls in the + second span at offset 1. + + The lookup is O(log N) via :func:`bisect_right`. + + Returns: + dataset_idx: Which source dataset this sample belongs to. 
+ row_idx: Row index *within* that dataset's LeRobot table. + episode_id: The episode ID for this sample. + frame_offset: Frame offset from the start of the episode span + (0-based). + + Pure index math -- no I/O or dataset access. Higher-level helpers + like :meth:`_fetch_sample` build on this. + """ + # Support negative indexing (e.g. -1 → last sample). + if idx < 0: + idx += self._num_valid_indices + if idx < 0 or idx >= self._num_valid_indices: + raise IndexError(f"{self.__class__.__name__} index {idx} out of range for size {self._num_valid_indices}") + + # _episode_cum_ends is a monotonically increasing list where entry i + # holds the cumulative number of valid frames up to and including the + # i-th episode span. bisect_right finds the first span whose + # cumulative end is strictly greater than idx, i.e. the span that + # contains idx. + # + # Example: cum_ends = [5, 8, 20] + # idx=0 -> span_idx=0 (first span, frames 0..4) + # idx=4 -> span_idx=0 + # idx=5 -> span_idx=1 (second span, frames 5..7) + # idx=8 -> span_idx=2 (third span, frames 8..19) + span_idx = bisect_right(self._episode_cum_ends, idx) + + # The global index where this span begins is the previous span's + # cumulative end (or 0 for the very first span). The frame_offset + # is how far idx is into this particular episode. + span_start = 0 if span_idx == 0 else self._episode_cum_ends[span_idx - 1] + frame_offset = idx - span_start + + # _episode_records[span_idx] stores (dataset_idx, row_start, valid_len, + # episode_id). row_start is the absolute row in the LeRobot table + # where this episode begins. With sample_stride=k, consecutive + # valid indices map to rows k apart inside the episode, so the + # effective row is row_start + frame_offset * sample_stride. 
+ dataset_idx, row_start, _, episode_id = self._episode_records[span_idx] + row_idx = row_start + frame_offset * self._sample_stride + return dataset_idx, row_idx, episode_id, frame_offset + + def _choose_mode(self) -> str: + """Resolve the active mode for one sample request.""" + if self._mode == "joint": + return random.choice(("forward_dynamics", "inverse_dynamics", "policy")) + return self._mode + + def _fetch_sample(self, idx: int) -> tuple[str, int, int, dict[str, Any]]: + """Resolve index, pick a mode, and load the sample from the dataset. + + Returns ``(mode, dataset_idx, row_idx, sample_dict)``. + """ + mode = self._choose_mode() + dataset_idx, row_idx, _, _ = self._resolve_index(idx) + + self._getitem_count = getattr(self, "_getitem_count", 0) + 1 + profile = self._memprofile and self._getitem_count % 50 == 1 + + with rss_tracker( + f"[WORKER {_os.getpid()}] __getitem__ transient (dataset_idx={dataset_idx})", + enabled=profile, + after_fn=lambda: log_worker_memory_breakdown(self), + ): + sample = self._get_dataset(dataset_idx)[row_idx] + + if self._skip_video_loading: + sample = defaultdict(lambda: None, sample) + + return mode, dataset_idx, row_idx, sample + + # -- action normalization ------------------------------------------------ + + def _normalizer_filename(self) -> str: + """Bundled stats filename for this dataset instance. + + Default convention (matches ``compute_action_stats.py`` output): + ``[_][_].json``. + + Pose/rotation suffixes are appended only when the instance actually + has them (SE(3) pose datasets like Bridge / DROID). Joint-space + datasets — where both are ``None`` — resolve to just + ``.json``. + + Subclasses may override when the bundled filename uses a different + scheme (e.g. UMI's ``uva_umi_single_task_normalizer.json``). + """ + if not self._embodiment_type: + raise RuntimeError( + f"{self.__class__.__name__}: embodiment_type is not set; cannot resolve normalizer filename." 
+ ) + parts = [self._embodiment_type] + if self._pose_convention: + parts.append(self._pose_convention) + if self._rotation_format: + parts.append(self._rotation_format) + return "_".join(parts) + ".json" + + def _normalizer_path(self) -> Path: + """Full path to the bundled stats JSON for this dataset.""" + return self._NORMALIZERS_DIR / self._normalizer_filename() + + def _load_norm_stats(self) -> dict[str, torch.Tensor]: + """Lazy-load action normalization stats (once per worker process). + + Raises :class:`FileNotFoundError` if the stats file is missing. This + is intentional — silently falling back to identity normalization when + the user asked for ``quantile`` / ``quantile_rot`` / ``meanstd`` / + ``minmax`` would be a training bug. + """ + if self._norm_stats is not None: + return self._norm_stats + stats_key = "global_raw" if self._action_normalization == "quantile_rot" else "global" + raw = load_action_stats(str(self._normalizer_path()), stats_key=stats_key) + self._norm_stats = {} + for key, value in raw.items(): + self._norm_stats[key] = torch.from_numpy(value).float() # [D] + return self._norm_stats + + def _normalize_action(self, action: torch.Tensor) -> torch.Tensor: + """Apply the configured normalization, or return the raw action. + + - ``action_normalization=None`` → pass-through (used by viewer / debug) + - ``"quantile"`` → ``2·(x − q01) / (q99 − q01) − 1`` clamped to [-1, 1] + - ``"quantile_rot"`` → same as ``"quantile"``, but using ``global_raw`` + stats so rotation dimensions are normalized too. 
+ - ``"meanstd"`` → ``(x − mean) / std`` + - ``"minmax"`` → ``2·(x − min) / (max − min) − 1`` clamped to [-1, 1] + """ + if self._action_normalization is None: + return action + method = "quantile" if self._action_normalization == "quantile_rot" else self._action_normalization + normalized_action = normalize_action( + action, + method, + self._load_norm_stats(), + ) # [T,D] + return normalized_action + + # -- video formatting ---------------------------------------------------- + + def _convert_video(self, video_tchw: torch.Tensor | None) -> torch.Tensor | None: + """Convert LeRobot ``(T,C,H,W)`` float video to Action ``(C,T,H,W)`` uint8. + + Args: + video_tchw: Raw floating-point video tensor in ``[0, 1]`` with + LeRobot layout, or ``None``. # [T,C,H,W] | None + + Returns: + Action-formatted video tensor, or ``None``. # [C,T,H,W] | None + """ + if self._skip_video_loading or video_tchw is None: + return None + if video_tchw.ndim != 4: + raise ValueError( + f"{self.__class__.__name__}._convert_video expected video with shape [T,C,H,W], " + f"got ndim={video_tchw.ndim}" + ) + if not torch.is_floating_point(video_tchw): + raise TypeError( + f"{self.__class__.__name__}._convert_video expected floating-point video in [0, 1], " + f"got dtype={video_tchw.dtype}" + ) + video_min = video_tchw.amin() # [] + video_max = video_tchw.amax() # [] + if video_min.item() < 0.0 or video_max.item() > 1.0: + raise ValueError( + f"{self.__class__.__name__}._convert_video expected floating-point video in [0, 1], " + f"got range=[{video_min.item():.6f}, {video_max.item():.6f}]" + ) + formatted_video = (video_tchw * 255.0).clamp(0.0, 255.0).to(torch.uint8).permute(1, 0, 2, 3) # [C,T,H,W] + return formatted_video + + # -- result building ----------------------------------------------------- + + def _build_action_spec(self) -> ActionSpec | None: + """Subclass override: declare this dataset's action layout. + + Called once per instance — the result is cached by ``self.action_spec``. 
+ Return ``None`` to skip spec-driven idle detection; in that case + ``_compute_idle_frames`` will log a one-time warning and return + ``None`` for every sample. + """ + return None + + @cached_property + def action_spec(self) -> ActionSpec | None: + """Cached :class:`ActionSpec` from ``_build_action_spec``. + + Returns ``None`` when the subclass did not declare one; idle detection + is then skipped (with a one-time warning) until the subclass overrides + ``_build_action_spec``. + """ + return self._build_action_spec() + + @cached_property + def action_names(self) -> list[str] | None: + spec = self.action_spec + return spec.names if spec is not None else None + + # Idle-detection thresholds. Defined as **velocities** (per second) so the + # same numeric value means the same physical motion across datasets with + # different sampling rates; converted to per-frame at call time using + # ``self._fps`` via :meth:`_resolve_idle_thresholds`. + # + # Defaults: + # - ``idle_eps_t_per_sec`` = 5 mm/s (≈ 1 mm/frame at 5 Hz) + # - ``idle_eps_r_per_sec`` = 1.5°/s (geodesic, rotation-format aware) + # - ``idle_eps_g`` = 1e-2 unit gripper Δ (no fps) + # - ``idle_joint_threshold_per_sec`` = 5e-3 rad/s + # - ``idle_min_streak`` = 3 require ≥ 3 consecutive + # + # Subclasses can either override the ``*_per_sec`` attributes (preferred — + # keeps the velocity semantics) or set the corresponding ``idle_eps_*`` / + # ``idle_joint_threshold`` attribute to a non-``None`` value to bypass the + # per-fps conversion entirely (raw per-frame override). + idle_eps_t_per_sec: float = 5e-3 + idle_eps_r_per_sec: float = math.radians(1.5) + idle_eps_g: float = 1e-2 + idle_joint_threshold_per_sec: float = 5e-3 + idle_min_streak: int = 3 + + # Optional per-frame overrides. ``None`` (default) → use the ``*_per_sec`` + # attribute / fps conversion above. 
+ idle_eps_t: float | None = None + idle_eps_r: float | None = None + idle_joint_threshold: float | None = None + + def _resolve_idle_thresholds(self) -> tuple[float, float, float, float]: + """Resolve per-frame idle thresholds for this dataset instance. + + Returns ``(eps_t, eps_r, eps_g, joint_threshold)`` in raw per-frame + units. Honours direct per-frame overrides if the subclass sets the + non-``_per_sec`` attribute; otherwise scales the ``_per_sec`` values + by ``self._fps``. + """ + fps = float(self._fps) if self._fps else 1.0 + eps_t = self.idle_eps_t if self.idle_eps_t is not None else self.idle_eps_t_per_sec / fps + eps_r = self.idle_eps_r if self.idle_eps_r is not None else self.idle_eps_r_per_sec / fps + joint_thr = ( + self.idle_joint_threshold + if self.idle_joint_threshold is not None + else self.idle_joint_threshold_per_sec / fps + ) + return float(eps_t), float(eps_r), float(self.idle_eps_g), float(joint_thr) + + def _compute_idle_frames(self, raw_action: torch.Tensor) -> torch.Tensor | None: + """Count idle frames in the *raw* (un-normalized) action chunk. + + Requires ``self.action_spec`` to be declared via ``_build_action_spec``. + Returns ``None`` when: + - ``pose_convention`` is not ``"backward_framewise"`` (TODO: extend), + - the subclass has not declared an ``ActionSpec`` (logs a one-time warning), + - the action layout does not match the declared spec. + + Detection thresholds come from the ``idle_eps_*`` class attributes + (overridable per dataset). Subclasses can also override this method + outright, or pass an explicit ``idle_frames`` integer via + ``**extras`` to :meth:`_build_result`. + """ + + # conventions (anchored / absolute) need different idle semantics. + if self._pose_convention != "backward_framewise": + if not getattr(self, "_warned_pose_convention", False): + log.warning( + f"Dataset {self.__class__.__name__}: pose_convention=" + f"{self._pose_convention!r} is not 'backward_framewise'; " + "skipping idle-frames detection. 
Centralize the dataset " + "to backward_framewise to enable IdleFrames captioning." + ) + self._warned_pose_convention = True + return None + + spec = self.action_spec + if spec is None: + if not getattr(self, "_warned_no_action_spec", False): + log.warning( + f"Dataset {self.__class__.__name__} has no action spec defined; " + "skipping idle-frames detection. Override _build_action_spec() to enable it." + ) + self._warned_no_action_spec = True + return None + + eps_t, eps_r, eps_g, joint_thr = self._resolve_idle_thresholds() + try: + n = compute_idle_frames( + raw_action, + spec, + eps_t=eps_t, + eps_r=eps_r, + eps_g=eps_g, + joint_threshold=joint_thr, + min_streak=self.idle_min_streak, + ) + except (ValueError, TypeError) as e: + if not getattr(self, "_warned_action_layout", False): + log.warning( + f"Dataset {self.__class__.__name__}: action layout does " + f"not match the declared ActionSpec " + f"(action_dim={int(raw_action.shape[-1])}, " + f"spec.dim={spec.dim}); skipping idle-frames detection. " + f"Underlying error: {e}" + ) + self._warned_action_layout = True + return None + return torch.tensor(n, dtype=torch.long) + + def _build_result( + self, + *, + mode: str, + video: torch.Tensor | None, + action: torch.Tensor, + ai_caption: str, + **extras: Any, + ) -> dict[str, Any]: + """Assemble the common return dict for ``__getitem__``. + + ``video`` is expected in raw LeRobot layout before final formatting. + Subclasses may pass extra keys (e.g. ``initial_pose``) via ``**extras``. + ``idle_frames`` is auto-computed from the raw (un-normalized) ``action`` + whenever the dataset's pose/rotation conventions allow it; subclasses + can override by passing ``idle_frames`` (int or scalar tensor) via + ``**extras``. + """ + # Compute idle_frames from the raw action before normalization, unless + # the subclass has provided one explicitly via ``**extras``. 
+ if "idle_frames" not in extras: + idle_frames = self._compute_idle_frames(action) + if idle_frames is not None: + extras = {"idle_frames": idle_frames, **extras} + + normalized_action = self._normalize_action(action) # [T,D] + if self._skip_video_loading: + result: dict[str, Any] = {"action": normalized_action} + if "idle_frames" in extras: + result["idle_frames"] = extras["idle_frames"] + return result + formatted_video = self._convert_video(video) # [C,T,H,W] | None + return { + "ai_caption": ai_caption, + "video": formatted_video, + "action": normalized_action, + "conditioning_fps": torch.tensor(self._fps, dtype=torch.long), + "mode": mode, + "domain_id": torch.tensor(self._domain_id, dtype=torch.long), + "viewpoint": self._viewpoint, + **extras, + } + + def __len__(self) -> int: + return self._num_valid_indices diff --git a/cosmos-framework/cosmos_framework/data/vfm/action/domain_utils.py b/cosmos-framework/cosmos_framework/data/vfm/action/domain_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..bd1f47e7e70a8e429d4572c69c122a36b0e37e56 --- /dev/null +++ b/cosmos-framework/cosmos_framework/data/vfm/action/domain_utils.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# Lookup table from lower-case embodiment name to integer domain id.
EMBODIMENT_TO_DOMAIN_ID: dict[str, int] = {
    "no_action": 0,
    "av": 1,
    "camera_pose": 2,
    "hand_pose": 3,
    "pusht": 4,
    "libero": 5,
    "umi": 6,
    "bridge_orig_lerobot": 7,
    "droid_lerobot": 8,
    "robomind-franka": 8,  # Both Droid and RoboMIND-Franka are using robotiq and franka
    "embodiment_b": 9,
    "robomind-franka-dual": 12,
    "robomind-ur": 13,
    "fractal": 20,
}


def get_domain_id(embodiment_type: str) -> int:
    """Look up the domain id registered for ``embodiment_type``.

    The name is matched case-insensitively and with surrounding whitespace
    ignored.

    Raises:
        KeyError: If the embodiment is not in ``EMBODIMENT_TO_DOMAIN_ID``; the
            message lists the known embodiments.
    """
    # Compare against ``None`` rather than truthiness: domain id 0 is valid.
    domain_id = EMBODIMENT_TO_DOMAIN_ID.get(embodiment_type.lower().strip())
    if domain_id is None:
        raise KeyError(
            f"Unknown embodiment type: {embodiment_type!r}. "
            f"Available embodiments: {sorted(EMBODIMENT_TO_DOMAIN_ID.keys())}"
        )
    return domain_id
_FILTER_DICT_PATH = "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/users/ychao/datasets/raw/KarlP-droid/keep_ranges_1_0_1.json"

# 90-degree clockwise rotation about the Z axis (in local frame), converting
# DROID Franka panda_link8 orientation to the OpenCV camera convention.
_DROID_TO_OPENCV: np.ndarray = np.array(
    [
        [0.0, -1.0, 0.0],
        [1.0, 0.0, 0.0],
        [0.0, 0.0, 1.0],
    ],
    dtype=np.float32,
)


class DROIDLeRobotDataset(BaseActionLeRobotDataset):
    """Action dataset wrapper for DROID recordings stored in LeRobot format.

    The dataset *version* is the last path component of ``root``; it selects
    the shard sub-directories and the feature names from the tables in
    ``droid_lerobot_dataset_config``. Three action spaces are supported:

    - ``"midtrain"``: 10-D relative end-effector pose (position, rot6d) plus
      gripper, with the rotation right-multiplied by ``_DROID_TO_OPENCV``.
    - ``"ee_pose_delta"``: 10-D pose delta relative to the first frame
      (translation, first two rotation-matrix columns) plus gripper.
    - ``"joint_pos"``: 8-D joint positions (7) plus gripper.
    """

    def __init__(
        self,
        root: str = "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/droid_plus_lerobot_640x360_20260412",
        fps: float = 15.0,
        chunk_length: int = 16,
        split_seed: int = 42,
        split_val_ratio: float = 0.03,
        split: str = "train",
        mode: str = "policy",
        pose_convention: PoseConvention = "backward_framewise",
        action_normalization: ActionNormalization | None = None,
        tolerance_s: float = 2e-4,
        viewpoint: Viewpoint = "concat_view",
        use_success_only: bool = False,
        video_mode: str | None = None,  # TODO (ychao): remove
        action_space: str = "midtrain",  # TODO (ychao): remove
        use_state: bool = False,
        use_filter_dict: bool = False,
        enable_fast_init: bool = False,
        max_num_history_actions: int = 0,
        use_image_augmentation: bool = False,
    ) -> None:
        """Configure feature names, shard roots and per-feature time offsets.

        Args:
            root: Dataset root. Its last path component must be a key of
                ``LEROBOT_ROOTS``; a trailing slash is tolerated.
            fps, chunk_length, split_seed, split_val_ratio, mode,
            pose_convention, action_normalization, tolerance_s,
            enable_fast_init: Forwarded unchanged to the base class.
            split: Forwarded to the base class. The value ``"val_temp_seg"``
                additionally reduces the index to at most one segment per
                index record (see ``_apply_temp_seg_filter``).
            viewpoint: Selects which camera streams are requested:
                ``"wrist_view"``, ``"third_person_view"`` or ``"concat_view"``
                (all three cameras).
            use_success_only: Keep only shard roots whose first path component
                is ``"success"``.
            video_mode: Legacy override of the video layout. ``None`` uses the
                viewpoint-driven layout; otherwise one of ``"wrist"``,
                ``"rand_exterior"``, ``"wrist_rand_exterior"``,
                ``"wrist_left_exterior"``, ``"wrist_both_exterior"``.
            action_space: ``"midtrain"``, ``"ee_pose_delta"`` or ``"joint_pos"``.
            use_state: Prepend the initial state as an extra first action row.
            use_filter_dict: Restrict samples to the frame ranges listed in the
                JSON file at ``_FILTER_DICT_PATH``.
            max_num_history_actions: Maximum number of past actions exposed as
                ``history_action``; ``0`` disables history.
            use_image_augmentation: Apply random crop / resize / color jitter
                when composing the multi-view frame.

        Raises:
            ValueError: If history is requested for an action space other than
                ``"midtrain"`` / ``"joint_pos"``, or if the dataset version is
                not listed in the config tables.
        """
        super().__init__(
            fps=fps,
            chunk_length=chunk_length,
            split_seed=split_seed,
            split_val_ratio=split_val_ratio,
            split=split,
            mode=mode,
            embodiment_type="droid_lerobot",
            viewpoint=viewpoint,
            pose_convention=pose_convention,
            rotation_format="rot6d",
            action_normalization=action_normalization,
            tolerance_s=tolerance_s,
            enable_fast_init=enable_fast_init,
        )
        self._use_success_only = use_success_only
        self._video_mode = video_mode
        self._action_space = action_space
        self._use_state = use_state
        self._use_filter_dict = use_filter_dict
        self._max_num_history_actions = max_num_history_actions
        self._use_image_augmentation = use_image_augmentation
        if max_num_history_actions > 0 and action_space not in ("midtrain", "joint_pos"):
            raise ValueError(
                f"max_num_history_actions is only supported with action_space='midtrain' or 'joint_pos', got {action_space!r}"
            )

        self._is_val_temp_seg = split == "val_temp_seg"
        self._to_opencv = _DROID_TO_OPENCV

        # normpath strips a trailing separator; basename("/a/b/") would
        # otherwise be "" and every lookup below would fail.
        version = os.path.basename(os.path.normpath(root))
        try:
            lerobot_roots = LEROBOT_ROOTS[version]
            self._image_features = IMAGE_FEATURES[version]
            self._state_features = STATE_FEATURES[version]
            self._action_features = ACTION_FEATURES[version]
            self._is_flat_action = IS_FLAT_ACTION[version]
            self._has_multi_language_annotations = HAS_MULTI_LANGUAGE_ANNOTATIONS[version]
            self._is_gripper_action_flipped = IS_GRIPPER_ACTION_FLIPPED[version]
        except KeyError as e:
            raise ValueError(f"Unknown version: {version!r}. Supported: {list(LEROBOT_ROOTS.keys())}") from e

        if self._use_success_only and lerobot_roots:
            lerobot_roots = [x for x in lerobot_roots if x.split("/", 1)[0] == "success"]

        # Versions whose LEROBOT_ROOTS entry is None (or empty) are a single
        # un-sharded dataset located directly at ``root``.
        self._all_shard_roots = [os.path.join(root, x) for x in lerobot_roots] if lerobot_roots else [root]

        # Observations cover chunk_length + 1 frames, actions chunk_length.
        observation_ts = [i * self._dt for i in range(0, self._chunk_length + 1)]
        action_ts = [i * self._dt for i in range(0, self._chunk_length)]
        if self._max_num_history_actions > 0 and self._action_space in ("midtrain", "joint_pos"):
            # Extended windows reach max_num_history_actions steps into the past.
            observation_ts_ext = [i * self._dt for i in range(-self._max_num_history_actions, self._chunk_length + 1)]
            action_ts_ext = [i * self._dt for i in range(-self._max_num_history_actions, self._chunk_length)]
        else:
            observation_ts_ext = observation_ts
            action_ts_ext = action_ts
        self._delta_timestamps: dict[str, list[float]] = {
            self._state_features: observation_ts_ext,
            self._action_features: action_ts_ext,
        }
        if self._viewpoint in ("wrist_view", "concat_view"):
            self._delta_timestamps[self._image_features["wrist"]] = observation_ts
        if self._viewpoint in ("third_person_view", "concat_view"):
            self._delta_timestamps[self._image_features["left"]] = observation_ts
            self._delta_timestamps[self._image_features["right"]] = observation_ts
        if self._action_space == "joint_pos":
            self._delta_timestamps[_JOINT_ACTION_FEATURE] = action_ts
            if self._use_state or self._max_num_history_actions > 0:
                self._delta_timestamps[_JOINT_STATE_FEATURE] = observation_ts_ext
                self._delta_timestamps[_GRIPPER_STATE_FEATURE] = observation_ts_ext
        if self._use_state and self._action_space != "joint_pos":
            # Non-joint spaces read the gripper state at index 0, so the
            # un-extended window is used here.
            self._delta_timestamps[_GRIPPER_STATE_FEATURE] = observation_ts

        if self._use_filter_dict:
            with open(_FILTER_DICT_PATH) as f:
                self._filter_dict = json.load(f)

        # Built lazily on first use because it depends on the frame size.
        self._image_augmentor: T.Compose | None = None

    def _append_index_records(self, *, meta, ds_idx: int, dataset_label: str | None = None) -> None:
        """Add index records for one shard, optionally restricted by the filter dict.

        Without ``use_filter_dict`` this defers to the base implementation.
        Otherwise each episode is looked up in the filter dict (episodes that
        are absent are dropped) and one record is appended per kept frame
        range, clipped to the episode's valid span.

        Args:
            meta: Shard metadata; ``total_episodes`` and ``episodes`` are read.
            ds_idx: Index of the shard the records belong to.
            dataset_label: Optional label used only in log messages.
        """
        if not self._use_filter_dict:
            super()._append_index_records(meta=meta, ds_idx=ds_idx, dataset_label=dataset_label)
            return

        episode_ids = split_episode_ids(
            total_episodes=meta.total_episodes,
            seed=self._split_seed,
            val_ratio=self._split_val_ratio,
            split=self._split,
        )
        episode_spans, _, sample_count = build_episode_spans(
            meta.episodes, episode_ids, self._chunk_length, sample_stride=self._sample_stride
        )

        class_name = self.__class__.__name__
        # Omit the bracketed label entirely when none was given instead of
        # logging a literal " [None]".
        label = f" [{dataset_label}]" if dataset_label else ""

        log.info(f"{class_name}{label}: split={self._split}, num episodes={len(episode_ids)}")

        filtered_count = 0
        for episode_id, sample_start, valid_len in episode_spans:
            ep_id_str = meta.episodes[episode_id]["episode_id"]
            episode_key = f"gs://xembodiment_data/r2d2/r2d2-data-full/{ep_id_str}/recordings/MP4--gs://xembodiment_data/r2d2/r2d2-data-full/{ep_id_str}/trajectory.h5"
            ranges = self._filter_dict.get(episode_key)
            if ranges is None:
                continue
            for s, e in ranges:
                # Clip the kept range so a full chunk still fits after the
                # last admissible start frame.
                sub_start = max(s, 0)
                sub_end = min(e - self._chunk_length, valid_len)
                sub_valid_len = max(0, sub_end - sub_start)
                if sub_valid_len > 0:
                    self._episode_records.append((ds_idx, sample_start + sub_start, sub_valid_len, episode_id))
                    self._num_valid_indices += sub_valid_len
                    self._episode_cum_ends.append(self._num_valid_indices)
                    filtered_count += sub_valid_len

        if sample_count > 0:
            log.info(
                f"{class_name}{label}: kept {filtered_count} / {sample_count} ({100.0 * filtered_count / sample_count:.2f} %) samples"
            )

    def _register_sources(self, indices: list[int] | None = None) -> None:
        """Register sources via the base class, then apply the ``val_temp_seg`` filter if active."""
        super()._register_sources(indices)
        if self._is_val_temp_seg:
            self._apply_temp_seg_filter()

    def _apply_temp_seg_filter(self) -> None:
        """Replace index records with one high-scoring segment per episode.

        A segment is interesting if either:
          - The gripper action changes significantly (open/close transition), or
          - The gripper is closed and the end-effector position is moving.
        Among qualifying segments the one with the highest score is kept.
        Records without any qualifying segment are dropped.
        """
        chunk_size = self._chunk_length + 1
        gripper_change_threshold = 0.5
        ee_movement_threshold = 0.01

        new_records: list[tuple[int, int, int, int]] = []
        num_episodes = len(self._episode_records)
        # Each record is read from the shard it belongs to; ``sample_start``
        # is an index into that shard, not into shard 0.
        datasets: dict[int, Any] = {}

        for ds_idx, sample_start, valid_len, episode_id in self._episode_records:
            end = sample_start + valid_len + self._chunk_length
            num_candidates = valid_len
            if num_candidates <= 0:
                continue

            ds = datasets.get(ds_idx)
            if ds is None:
                ds = datasets[ds_idx] = self._get_dataset(ds_idx)

            episode_data = ds.hf_dataset[sample_start:end]
            actions = torch.tensor(np.array(episode_data[self._action_features]))  # [N,action_dim]
            states = torch.tensor(np.array(episode_data[self._state_features]))  # [N,state_dim]

            gripper_action = actions[:, 6] if self._is_flat_action else actions  # [N]
            ee_pos = states[:, :3]  # [N,3]
            ee_disp = (ee_pos[1:] - ee_pos[:-1]).norm(dim=-1)  # [N-1]

            # One sliding window per candidate start frame.
            ee_disp_windows = ee_disp.unfold(0, self._chunk_length, 1)  # [num_candidates,chunk_length]
            gripper_windows = gripper_action.unfold(0, chunk_size, 1)  # [num_candidates,chunk_size]

            gripper_range = gripper_windows.max(dim=1).values - gripper_windows.min(dim=1).values  # [num_candidates]
            total_ee_movement = ee_disp_windows.sum(dim=1)  # [num_candidates]
            # NOTE(review): this thresholds the raw gripper action; the
            # IS_GRIPPER_ACTION_FLIPPED flag is not applied here - confirm
            # that "< 0.5" means "closed" for every dataset version.
            gripper_closed_ratio = (gripper_windows < 0.5).float().mean(dim=1)  # [num_candidates]

            has_gripper_change = gripper_range > gripper_change_threshold
            gripper_closed = gripper_closed_ratio > 0.5
            has_ee_movement = total_ee_movement > ee_movement_threshold

            scores = torch.zeros(num_candidates)  # [num_candidates]
            scores[has_gripper_change] = 0.5 + gripper_range[has_gripper_change] + total_ee_movement[has_gripper_change]

            closed_and_moving = gripper_closed & ~has_gripper_change & has_ee_movement
            scores[closed_and_moving] = 1.0 + total_ee_movement[closed_and_moving]

            if scores.max().item() > 0:
                best_offset = int(scores.argmax().item())
                new_records.append((ds_idx, sample_start + best_offset, 1, episode_id))

        self._episode_records = new_records
        self._num_valid_indices = len(new_records)
        self._episode_cum_ends = list(range(1, len(new_records) + 1))

        log.info(f"DROIDLeRobotDataset: val_temp_seg kept {len(new_records)} segments from {num_episodes} episodes")

    def _compose_multi_view(self, sample: dict[str, Any]) -> torch.Tensor:
        """Compose wrist, left, and right views into a single frame.

        Layout (per frame):
            ┌──────────────┐
            │    wrist     │  (H, W)
            ├───────┬──────┤
            │ left  │ right│  (H/2, W/2) each
            └───────┴──────┘

        Left and right exterior cameras are downscaled by 2x so that they
        tile to the same width as the wrist view. The output height is 3H/2.

        Returns:
            Composited raw video tensor in ``(T,C,H_out,W)`` float format.
        """
        wrist = sample[self._image_features["wrist"]]  # [T,C,H,W]
        left = sample[self._image_features["left"]]  # [T,C,H_l,W_l]
        right = sample[self._image_features["right"]]  # [T,C,H_r,W_r]

        if self._use_image_augmentation:
            if self._image_augmentor is None:
                # Sized from the first wrist frame seen and then reused.
                _, _, h, w = wrist.shape
                self._image_augmentor = T.Compose(
                    [
                        T.RandomCrop((int(h * 0.95), int(w * 0.95))),
                        T.Resize((h, w), antialias=True),
                        T.ColorJitter(brightness=0.3, contrast=0.4, saturation=0.5, hue=0.08),
                    ]
                )
            # All views go through the augmentor in one call, stacked along
            # the frame axis, and are split back apart afterwards.
            n, m = wrist.shape[0], wrist.shape[0] + left.shape[0]
            combined = self._image_augmentor(torch.cat([wrist, left, right], dim=0))
            wrist, left, right = combined[:n], combined[n:m], combined[m:]

        _, _, h_w, w_w = wrist.shape
        half_h, half_w = h_w // 2, w_w // 2

        left = F.interpolate(left, size=(half_h, half_w), mode="bilinear", align_corners=False)  # [T,C,H/2,W/2]
        right = F.interpolate(right, size=(half_h, half_w), mode="bilinear", align_corners=False)  # [T,C,H/2,W/2]
        bottom = torch.cat([left, right], dim=-1)  # [T,C,H/2,W]

        composite = torch.cat([wrist, bottom], dim=-2)  # [T,C,3H/2,W]
        return composite  # [T,C,3H/2,W]

    def _build_action_spec(self) -> ActionSpec:
        """DROID: 10D ``[Pos, Rot6d, Gripper]`` for ``ee_pose``,
        8D ``[Joint(7), Gripper]`` for ``joint_pos``.
        """
        if self._action_space == "joint_pos":
            return build_action_spec(Joint(n=7, label="joint"), Gripper())
        return build_action_spec(Pos(), Rot("rot6d"), Gripper())

    def _load_video(self, sample: dict[str, Any]) -> torch.Tensor | None:
        """Select or compose the video tensor according to the configured layout.

        Returns ``None`` when video loading is skipped.

        Raises:
            ValueError: If ``video_mode`` is set to an unsupported value.
        """
        if self._skip_video_loading:
            return None

        video_mode = self._video_mode
        if video_mode is None:
            if self._viewpoint == "concat_view":
                return self._compose_multi_view(sample)
            # NOTE(review): every other viewpoint returns the wrist stream,
            # which __init__ only requests for "wrist_view"/"concat_view".
            return sample[self._image_features["wrist"]]  # [T,C,H,W]

        if video_mode == "wrist":
            return sample[self._image_features["wrist"]]

        if video_mode in ("rand_exterior", "wrist_rand_exterior"):
            exterior_key = random.choice([self._image_features["left"], self._image_features["right"]])
            if video_mode == "rand_exterior":
                return sample[exterior_key]
            # Stack wrist on top of the chosen exterior view (height axis).
            return torch.cat([sample[self._image_features["wrist"]], sample[exterior_key]], dim=2)

        if video_mode in ("wrist_left_exterior", "wrist_both_exterior"):
            wrist = sample[self._image_features["wrist"]]
            half_h, half_w = wrist.shape[2] // 2, wrist.shape[3] // 2
            left = F.interpolate(
                sample[self._image_features["left"]], size=(half_h, half_w), mode="bilinear", align_corners=False
            )
            if video_mode == "wrist_left_exterior":
                # The right tile is blank when only the left exterior is used.
                right = torch.zeros_like(left)
            else:
                right = F.interpolate(
                    sample[self._image_features["right"]],
                    size=(half_h, half_w),
                    mode="bilinear",
                    align_corners=False,
                )
            return torch.cat([wrist, torch.cat([left, right], dim=-1)], dim=-2)

        # Previously an unknown mode left ``video`` unbound and surfaced as
        # an UnboundLocalError further down.
        raise ValueError(f"Unsupported video_mode: {video_mode!r}")

    def _gripper_action(self, sample: dict[str, Any]) -> torch.Tensor:
        """Return the gripper action as an ``[N,1]`` tensor, un-flipped if the version requires it."""
        raw = sample[self._action_features]
        # Flat-action versions store the gripper in column 6 of the action.
        gripper = raw[:, [6]] if self._is_flat_action else raw.unsqueeze(-1)
        if self._is_gripper_action_flipped:
            gripper = 1.0 - gripper
        return gripper

    def _sample_history_length(self, idx: int) -> int:
        """Pick how many history actions to expose for the sample at ``idx``.

        The count is capped by ``max_num_history_actions`` and by the number
        of frames preceding the sample; with probability 0.5 it is replaced
        by a uniform draw from ``[0, cap]``.
        """
        _, _, _, frame_offset = self._resolve_index(int(idx))
        num_available = min(self._max_num_history_actions, frame_offset * self._sample_stride)
        actual_h = num_available
        # with 0.5 probability, randomly sample the number of history frames
        if random.random() < 0.5:
            actual_h = random.randint(0, num_available)
        return actual_h

    def __getitem__(self, idx: int) -> dict[str, Any]:
        """Build one sample: caption, video, action chunk and extras.

        Extras may contain ``initial_pose`` (``midtrain``), ``history_action``
        (when history is enabled and available) and
        ``additional_view_description`` (concatenated multi-view layouts).
        With ``use_state`` the initial state is prepended as the first action row.

        Raises:
            ValueError: If ``video_mode`` or ``action_space`` is unsupported.
        """
        mode, _, _, sample = self._fetch_sample(idx)

        if self._has_multi_language_annotations:
            # Several annotations are packed into one " | "-separated string.
            tasks = sample["task"].split(" | ")
            ai_caption = random.choice(tasks)
        else:
            ai_caption = sample["task"]

        video = self._load_video(sample)

        extras: dict[str, Any] = {}
        chunk = self._chunk_length

        if self._action_space == "midtrain":
            pose_convention = cast(PoseConvention, self._pose_convention)
            state = sample[self._state_features]  # [T+1, state_dim] or [H+T+1, state_dim]
            poses_abs = build_abs_pose_from_components(state[:, 0:3], state[:, 3:6], "euler_xyz")
            poses_abs[:, :3, :3] = poses_abs[:, :3, :3] @ self._to_opencv
            # Index -chunk-1 is the current frame whether or not history
            # frames were prepended.
            initial_pose = torch.from_numpy(poses_abs[-chunk - 1].copy()).float()
            poses_rel = pose_abs_to_rel(poses_abs, rotation_format="rot6d", pose_convention=pose_convention)
            gripper = self._gripper_action(sample)
            action = torch.from_numpy(
                np.concatenate([poses_rel[-chunk:], gripper[-chunk:]], axis=-1)
            ).float()  # [T,10]
            extras["initial_pose"] = initial_pose
            if self._max_num_history_actions > 0:
                actual_h = self._sample_history_length(idx)
                if actual_h > 0:
                    hist_action_raw = torch.from_numpy(
                        np.concatenate(
                            [
                                poses_rel[-chunk - actual_h : -chunk],
                                gripper[-chunk - actual_h : -chunk],
                            ],
                            axis=-1,
                        )
                    ).float()
                    extras["history_action"] = self._normalize_action(hist_action_raw)
            if self._use_state:
                initial_gripper = sample[_GRIPPER_STATE_FEATURE][0].unsqueeze(-1)
                if self._is_gripper_action_flipped:
                    initial_gripper = 1.0 - initial_gripper
                initial_rot6d = convert_rotation(poses_abs[-chunk - 1, :3, :3], "matrix", "rot6d")
                initial_state = torch.from_numpy(
                    np.concatenate((poses_abs[-chunk - 1, :3, 3], initial_rot6d, initial_gripper), axis=-1)
                ).float()
                action = torch.cat([initial_state.unsqueeze(0), action], dim=0)
        elif self._action_space == "ee_pose_delta":
            state = sample[self._state_features]
            pose = np.tile(np.eye(4), (state.shape[0], 1, 1))
            pose[:, :3, :3] = R.from_euler("xyz", state[:, 3:6]).as_matrix()
            pose[:, :3, 3] = state[:, 0:3]
            # Every later pose expressed in the frame of the first pose.
            pose_delta = np.linalg.inv(pose[0]) @ pose[1:]
            # Shared helper so flat-action versions are handled here as well.
            gripper = self._gripper_action(sample)
            action = torch.from_numpy(
                np.concatenate((pose_delta[:, :3, 3], pose_delta[:, :3, 0], pose_delta[:, :3, 1], gripper), axis=-1)
            ).float()
            if self._use_state:
                initial_gripper = sample[_GRIPPER_STATE_FEATURE][0].unsqueeze(-1)
                if self._is_gripper_action_flipped:
                    initial_gripper = 1.0 - initial_gripper
                initial_state = torch.from_numpy(
                    np.concatenate((pose[0, :3, 3], pose[0, :3, 0], pose[0, :3, 1], initial_gripper), axis=-1)
                ).float()
                action = torch.cat([initial_state.unsqueeze(0), action], dim=0)
        elif self._action_space == "joint_pos":
            gripper = self._gripper_action(sample)[-chunk:]
            action = torch.cat((sample[_JOINT_ACTION_FEATURE], gripper), dim=-1).float()
            if self._max_num_history_actions > 0:
                actual_h = self._sample_history_length(idx)
                if actual_h > 0:
                    # History is taken from recorded states, not actions.
                    hist_joint = sample[_JOINT_STATE_FEATURE][-chunk - 1 - actual_h : -chunk - 1]
                    hist_gripper = sample[_GRIPPER_STATE_FEATURE][-chunk - 1 - actual_h : -chunk - 1].unsqueeze(-1)
                    if self._is_gripper_action_flipped:
                        hist_gripper = 1.0 - hist_gripper
                    hist_action_raw = torch.cat((hist_joint, hist_gripper), dim=-1).float()
                    extras["history_action"] = self._normalize_action(hist_action_raw)
            if self._use_state:
                initial_gripper = sample[_GRIPPER_STATE_FEATURE][-chunk - 1].unsqueeze(-1)
                if self._is_gripper_action_flipped:
                    initial_gripper = 1.0 - initial_gripper
                initial_state = torch.cat(
                    (sample[_JOINT_STATE_FEATURE][-chunk - 1], initial_gripper), dim=-1
                ).float()
                action = torch.cat([initial_state.unsqueeze(0), action], dim=0)
        else:
            # Previously ``action`` stayed unbound and the call below failed
            # with an UnboundLocalError.
            raise ValueError(f"Unsupported action_space: {self._action_space!r}")

        if self._viewpoint == "concat_view" and self._video_mode in (
            None,
            "wrist_left_exterior",
            "wrist_both_exterior",
        ):
            extras["additional_view_description"] = (
                "The top row is from the wrist-mounted camera. "
                "The bottom row contains two horizontally concatenated third-person perspective views of the scene from opposite sides, with the robot visible."
            )

        return self._build_result(
            mode=mode,
            video=video,
            action=action,
            ai_caption=ai_caption,
            **extras,
        )

    @property
    def action_dim(self) -> int:
        """Per-step action width: 8 for ``joint_pos``, otherwise 10."""
        return 8 if self._action_space == "joint_pos" else 10
"observation.images.exterior_image_1_left", + "right": "observation.images.exterior_image_2_left", + }, + "droid_plus_lerobot_320x180_20260406_sharded": { + "wrist": "observation.image.wrist_image_left", + "left": "observation.image.exterior_image_1_left", + "right": "observation.image.exterior_image_2_left", + }, + "droid_plus_lerobot_320x180_20260406": { + "wrist": "observation.image.wrist_image_left", + "left": "observation.image.exterior_image_1_left", + "right": "observation.image.exterior_image_2_left", + }, + "droid_plus_lerobot_640x360_20260412_sharded": { + "wrist": "observation.image.wrist_image_left", + "left": "observation.image.exterior_image_1_left", + "right": "observation.image.exterior_image_2_left", + }, + "droid_plus_lerobot_640x360_20260412": { + "wrist": "observation.image.wrist_image_left", + "left": "observation.image.exterior_image_1_left", + "right": "observation.image.exterior_image_2_left", + }, +} + +STATE_FEATURES = { + "droid_lerobot_20260115_no_noops": "observation.state", + "droid_plus_lerobot_320x180_20260406_sharded": "observation.state.cartesian_position", + "droid_plus_lerobot_320x180_20260406": "observation.state.cartesian_position", + "droid_plus_lerobot_640x360_20260412_sharded": "observation.state.cartesian_position", + "droid_plus_lerobot_640x360_20260412": "observation.state.cartesian_position", +} + +ACTION_FEATURES = { + "droid_lerobot_20260115_no_noops": "action", + "droid_plus_lerobot_320x180_20260406_sharded": "action.gripper_position", + "droid_plus_lerobot_320x180_20260406": "action.gripper_position", + "droid_plus_lerobot_640x360_20260412_sharded": "action.gripper_position", + "droid_plus_lerobot_640x360_20260412": "action.gripper_position", +} + +IS_FLAT_ACTION = { + "droid_lerobot_20260115_no_noops": True, + "droid_plus_lerobot_320x180_20260406_sharded": False, + "droid_plus_lerobot_320x180_20260406": False, + "droid_plus_lerobot_640x360_20260412_sharded": False, + "droid_plus_lerobot_640x360_20260412": False, +} 
+ +HAS_MULTI_LANGUAGE_ANNOTATIONS = { + "droid_lerobot_20260115_no_noops": False, + "droid_plus_lerobot_320x180_20260406_sharded": True, + "droid_plus_lerobot_320x180_20260406": True, + "droid_plus_lerobot_640x360_20260412_sharded": True, + "droid_plus_lerobot_640x360_20260412": True, +} + +IS_GRIPPER_ACTION_FLIPPED = { + "droid_lerobot_20260115_no_noops": False, + "droid_plus_lerobot_320x180_20260406_sharded": True, + "droid_plus_lerobot_320x180_20260406": True, + "droid_plus_lerobot_640x360_20260412_sharded": True, + "droid_plus_lerobot_640x360_20260412": True, +} + +_JOINT_ACTION_FEATURE = "action.joint_position" +_JOINT_STATE_FEATURE = "observation.state.joint_positions" +_GRIPPER_STATE_FEATURE = "observation.state.gripper_position" diff --git a/cosmos-framework/cosmos_framework/data/vfm/action/fractal.py b/cosmos-framework/cosmos_framework/data/vfm/action/fractal.py new file mode 100644 index 0000000000000000000000000000000000000000..cfb6ce0565158ffbc3174a7b9c229481c87471bf --- /dev/null +++ b/cosmos-framework/cosmos_framework/data/vfm/action/fractal.py @@ -0,0 +1,192 @@ +# Fractal (fractal20220817_data) — Google Robot RT-1 dataset +# LeRobot v2.0 format from IPEC-COMMUNITY/fractal20220817_data_lerobot +# +# Robot: google_robot +# 87,212 episodes, 3,786,400 frames, 599 tasks, fps=3 +# state: [x, y, z, rx, ry, rz, rw, gripper] (8D, quaternion) +# action: [x, y, z, roll, pitch, yaw, gripper] (7D, delta) +# video: observation.images.image (256×320) + +from typing import Any, cast + +import numpy as np +import torch +from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata + +from cosmos_framework.utils import log +from cosmos_framework.data.vfm.action.cosmos3_action_lerobot import ( + ActionNormalization, + ActionSpec, + BaseActionLeRobotDataset, + Gripper, + Pos, + Rot, + build_action_spec, +) +from cosmos_framework.data.vfm.action.pose_utils import ( + PoseConvention, + build_abs_pose_from_components, + pose_abs_to_rel, +) +from 
cosmos_framework.data.vfm.action.viewpoint_utils import Viewpoint + +_VALID_POSE_CONVENTIONS = ("backward_anchored", "backward_framewise") +# These episodes contain base motion, which breaks the fixed-base Google Robot +# action assumption used by training and the viewer. +_SKIPPED_EPISODE_IDS: frozenset[int] = frozenset({29, 189, 382}) + +# Google Robot raw EE frame has x/y axes rotated ~90° around z compared to +# OpenCV convention. Rz(-90°) as a right-multiply corrects this: +# new_x = -old_y (rightward), new_y = old_x (downward), z unchanged (approach). +_GOOGLE_ROBOT_TO_OPENCV = np.array([[0, 1, 0], [-1, 0, 0], [0, 0, 1]], dtype=np.float32) + +# --------------------------------------------------------------------------- +# TCP → flange (gripper body) offset +# --------------------------------------------------------------------------- +# The fractal dataset records EE poses at ``link_gripper_tcp`` — a calibrated +# tool-center-point 164 mm past the gripper body (``link_gripper``), roughly +# at the fingertip. For action learning we re-reference poses to the +# *gripper body* (``link_gripper``) because: +# 1. It is the last actuated link — its pose is fully determined by joint +# angles, whereas the TCP has a tiny calibration-dependent tilt (~0.25°). +# 2. Gripper body is a more natural frame for grasping tasks: the position +# is at the wrist, not at the fragile fingertip. +# 3. The ~10 cm offset reduces the lever-arm effect of small rotation +# errors on position accuracy. 
#
# The constant below is the SE(3) transform from ``link_gripper_tcp`` to
# ``link_gripper``, computed from the SimplerEnv URDF via pinocchio FK at the
# neutral configuration:
#   T = oMf[link_gripper_tcp]⁻¹ · oMf[link_gripper]
#
# Source URDF: https://github.com/simpler-env/ManiSkill2_real2sim
#   → mani_skill2_real2sim/assets/descriptions/google_robot_description/
# fmt: off
_TCP_TO_FLANGE = np.array([
    [+0.9999897671, -0.0008686425, +0.0044397163, -0.0050618476],
    [+0.0008745501, +0.9999987346, -0.0013288658, -0.0016717725],
    [-0.0044385564, +0.0013327349, +0.9999892615, -0.1635144743],
    [+0.0000000000, +0.0000000000, +0.0000000000, +1.0000000000],
], dtype=np.float32)
# fmt: on


class FractalLeRobotDataset(BaseActionLeRobotDataset):
    """Fractal (Google RT-1) recordings in LeRobot format, exposed as 10-D relative-pose actions."""

    def __init__(
        self,
        root: str = "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/fractal20220817_data_no_noops",
        fps: float = 3.0,
        chunk_length: int = 16,
        split_seed: int = 42,
        split_val_ratio: float = 0.05,
        split: str = "train",
        mode: str = "policy",
        pose_convention: PoseConvention = "backward_framewise",
        action_normalization: ActionNormalization | None = None,
        viewpoint: Viewpoint = "ego_view",
        enable_fast_init: bool = False,
    ) -> None:
        """Set up a single-shard Fractal dataset and its per-feature time offsets.

        Args:
            root: Path to the local LeRobot dataset root (used as the only shard).
            fps: Frames per second of the dataset.
            chunk_length: Number of action frames per sample.
            split_seed: Seed for deterministic train/val splitting.
            split_val_ratio: Fraction of episodes held out for validation.
            split: One of "train", "val", or "full".
            mode: Training mode — "policy", "forward_dynamics",
                "inverse_dynamics", "image2video", or "joint".
            pose_convention: Relative-pose convention used to encode SE(3)
                actions, e.g. ``"backward_framewise"`` or
                ``"backward_anchored"``. It is forwarded to the base class
                and later passed to ``pose_abs_to_rel``.
            action_normalization: Optional bundled-stats normalization
                (``"quantile"`` / ``"quantile_rot"`` / ``"meanstd"`` / ``"minmax"``);
                ``None`` returns raw actions.
            viewpoint: Camera viewpoint type for this dataset.
            enable_fast_init: Forwarded unchanged to the base class.
        """
        super().__init__(
            fps=fps,
            chunk_length=chunk_length,
            split_seed=split_seed,
            split_val_ratio=split_val_ratio,
            split=split,
            mode=mode,
            embodiment_type="fractal",
            viewpoint=viewpoint,
            pose_convention=pose_convention,
            rotation_format="rot6d",
            action_normalization=action_normalization,
            tolerance_s=1e-4,
            enable_fast_init=enable_fast_init,
        )

        self._all_shard_roots = [root]
        self._to_opencv = _GOOGLE_ROBOT_TO_OPENCV

        # Observations span chunk_length + 1 frames; actions span chunk_length.
        observation_offsets = [step * self._dt for step in range(self._chunk_length + 1)]
        action_offsets = [step * self._dt for step in range(self._chunk_length)]
        self._delta_timestamps = {
            "observation.images.image": observation_offsets,
            "observation.state": list(observation_offsets),
            "action": action_offsets,
        }

    def _filter_valid_episodes(self, meta: LeRobotDatasetMetadata, episode_ids: list[int]) -> list[int]:
        """Remove episodes on the module-level skip list and log how many were removed."""
        kept = list(filter(lambda ep_id: ep_id not in _SKIPPED_EPISODE_IDS, episode_ids))
        dropped = len(episode_ids) - len(kept)
        if dropped:
            log.info(
                f"FractalLeRobotDataset: dropped {dropped} / {len(episode_ids)} "
                f"episodes from skip list {sorted(_SKIPPED_EPISODE_IDS)} "
                f"(total_episodes={meta.total_episodes})"
            )
        return kept

    def _build_action_spec(self) -> ActionSpec:
        """Fractal: 10D = ``[Pos(3), Rot6d(6), Gripper(1)]``."""
        return build_action_spec(Pos(dim=3), Rot("rot6d"), Gripper())

    def __getitem__(self, idx: int) -> dict[str, Any]:
        """Assemble one sample: video, caption, 10-D action chunk and the initial pose."""
        mode, _, _, sample = self._fetch_sample(idx)

        frames = sample["observation.images.image"]  # [T,C,H,W]

        # Columns 0:3 are the position and 3:7 the quaternion in
        # (x, y, z, w) order, which is what "quat_xyzw" expects.
        ee_state = sample["observation.state"]  # [T+1, 8]
        position, quaternion = ee_state[:, 0:3], ee_state[:, 3:7]
        abs_poses = build_abs_pose_from_components(position, quaternion, "quat_xyzw")

        # Re-reference from link_gripper_tcp to link_gripper first, then
        # rotate the orientation (only) into the OpenCV convention.
        abs_poses = abs_poses @ _TCP_TO_FLANGE
        abs_poses[:, :3, :3] = abs_poses[:, :3, :3] @ self._to_opencv

        first_pose = torch.from_numpy(abs_poses[0].copy()).float()
        rel_poses = pose_abs_to_rel(
            abs_poses,
            rotation_format="rot6d",
            pose_convention=cast(PoseConvention, self._pose_convention),
        )

        se3_part = torch.from_numpy(rel_poses).float()  # SE3 relative pose (rot6d)
        gripper_part = sample["action"][:, [6]]  # gripper (1D)
        action = torch.cat((se3_part, gripper_part), dim=-1)  # [T, 10]

        return self._build_result(
            mode=mode,
            video=frames,
            action=action,
            ai_caption=sample["task"],
            initial_pose=first_pose,
        )

    @property
    def action_dim(self) -> int:
        """Action width: position (3) + 6D rotation (6) + gripper (1) = 10."""
        return 10
LIBERO_ROOTS: list[str] = [
    "",
    "",
    "",
    "",
    "",
]


class LIBERODataset(Dataset):
    """
    A Dataset wrapper for LeRobot LIBERO dataset(s) designed for training from local storage.

    This dataset:
    - Loads data from local storage using LeRobotDataset
    - Supports multiple dataset roots that are concatenated into one dataset
    - Supports configurable camera modes (image, wrist_image, or concat_view)
    - Filters episodes for train/val split
    - Filters frames at episode boundaries (to avoid padding issues with delta timestamps)
    - Uses task descriptions from meta/tasks.parquet for ai_caption
    """

    # Directory holding the bundled per-embodiment action statistics JSON files.
    _NORMALIZERS_DIR = Path(__file__).parent / "normalizers"

    def __init__(
        self,
        repo_id: str | list[str] = "lerobot/libero_90",
        root: str | list[str] | None = LIBERO_ROOTS,
        image_size: int = 256,
        chunk_length: int = 16,  # must be divisible by 4
        fps: int = 10,  # IMPORTANT! LIBERO is at 20fps. If using frame_wise_relative in policy mode, we have to match the fps.
        mode: str = "policy",
        video_backend: str | None = "torchcodec",
        download_videos: bool = False,
        force_cache_sync: bool = False,
        tolerance_s: float = 1e-4,
        split: str = "train",
        val_ratio: float = 0.01,
        seed: int = 0,
        # Camera configuration
        camera_mode: str = "image",  # 'image', 'wrist_image', or 'concat_view'
        # Action configuration
        action_space: str = "frame_wise_relative",  # "absolute" or "relative" or "frame_wise_relative"
        # rotation_space
        rotation_space: Literal["9d", "6d", "3d"] = "3d",
        # Native simulator frame or shared OpenCV-style EE frame used by midtraining.
        pose_coordinate_frame: Literal["native", "opencv"] = "native",
        # domain-aware configuration
        embodiment_type: str = "libero",
        action_normalization: Literal["quantile", "quantile_rot", "meanstd", "minmax"] | None = None,
        action_stats_path: str | None = None,
        skip_video_loading: bool = False,
    ):
        """Build the underlying LeRobot datasets and the flat sample index.

        Args:
            repo_id: One LeRobot repo id, or one per entry in ``root``.
            root: Local dataset root(s). ``None`` uses ``None`` for every repo id.
                Must have the same number of entries as ``repo_id``.
            image_size: Square side length frames are resized to in ``__getitem__``.
            chunk_length: Number of action steps per sample; must be divisible by 4.
            fps: Sampling rate used to space the delta timestamps (``1 / fps`` seconds).
            mode: Task mode returned with each sample; ``"joint"`` picks one at random per item.
            video_backend, download_videos, force_cache_sync, tolerance_s: Forwarded
                unchanged to ``LeRobotDataset``.
            split: ``train``, ``full``, or one of the validation aliases
                (``val``/``valid``/``validation``/``eval``/``test``).
            val_ratio: Fraction of episodes held out for validation; must be in (0, 1).
            seed: Seed for the deterministic train/val episode split.
            camera_mode: ``image``, ``wrist_image`` or ``concat_view``.
            action_space: ``relative`` or ``frame_wise_relative`` (other values are
                rejected when the first item is fetched).
            rotation_space: Rotation representation of the emitted action.
            pose_coordinate_frame: Stored on the sample and used to pick bundled stats.
            embodiment_type: Used for the domain id and the bundled stats filename.
            action_normalization: Normalization method, or ``None`` to emit raw actions.
            action_stats_path: Explicit stats file; ``None`` uses the bundled file.
            skip_video_loading: When True no image key is requested and ``video`` is ``None``.

        Raises:
            ValueError: Unsupported ``camera_mode``/``split``, or ``repo_id``/``root``
                length mismatch.
            FileNotFoundError: Action statistics could not be located.
        """
        super().__init__()
        self._embodiment_type = embodiment_type
        self.domain_id = get_domain_id(embodiment_type)
        self.image_size = image_size
        self.chunk_length = chunk_length
        assert self.chunk_length % 4 == 0, "chunk_length must be divisible by 4"
        self.fps = fps
        self.mode = mode
        self.split = split.lower().strip()
        self.val_ratio = val_ratio
        self.seed = seed
        self.camera_mode = camera_mode.lower().strip()
        self.action_space = action_space
        self.action_normalization = action_normalization
        self.rotation_space = rotation_space.lower().strip()
        self.pose_coordinate_frame = pose_coordinate_frame
        self._pose_convention = self.action_space
        self._rotation_format = libero_rotation_format(self.rotation_space)
        # When True, skip video decoding entirely: drop image keys from
        # delta_timestamps so LeRobot never touches the mp4, and return
        # ``video=None`` in __getitem__. Must be set at construction time
        # because LeRobotDataset is eagerly built in __init__.
        self._skip_video_loading = bool(skip_video_loading)

        # Load action normalization stats. ``action_min`` / ``action_range`` are
        # retained for older LIBERO eval code that knows how to invert a
        # range-style [-1, 1] normalization.
        self._norm_stats: dict[str, torch.Tensor] | None = None
        self.action_min: torch.Tensor | None = None
        self.action_max: torch.Tensor | None = None
        self.action_range: torch.Tensor | None = None
        if self.action_normalization is not None:
            stats_path = self._resolve_action_stats_path(action_stats_path)
            stats_key = "global_raw" if self.action_normalization == "quantile_rot" else "global"
            raw_stats = load_action_stats(str(stats_path), stats_key=stats_key)
            self._norm_stats = {}
            for key, value in raw_stats.items():
                self._norm_stats[key] = torch.from_numpy(value).float()  # [D]
            self._set_range_denormalization_stats()
            log.info(
                f"Loaded LIBERO action stats from {stats_path} with action_normalization={self.action_normalization}"
            )

        # Validate camera mode
        if self.camera_mode not in {"image", "wrist_image", "concat_view"}:
            raise ValueError(f"Unsupported camera_mode={camera_mode!r}. Use 'image', 'wrist_image', or 'concat_view'.")

        # Validate split
        if self.split not in {"train", "val", "valid", "validation", "eval", "test", "full"}:
            raise ValueError(f"Unsupported {split=}. Use train/val/full.")

        # Build delta timestamps based on camera mode
        dt = 1.0 / self.fps

        if self.fps != 20:
            log.warning(
                f"LIBERO is at 20fps. If using frame_wise_relative for policy mode training, we have to match the fps. fps={self.fps}"
            )

        # Determine which image keys to use
        if self.camera_mode == "image":
            self.image_keys = ["observation.images.image"]
        elif self.camera_mode == "wrist_image":
            self.image_keys = ["observation.images.wrist_image"]
        else:  # concat_view
            self.image_keys = ["observation.images.image", "observation.images.wrist_image"]

        # Build delta_timestamps for all keys (same convention as PushT: 0 to chunk_length)
        self.delta_timestamps: dict[str, list[float]] = {}
        if not self._skip_video_loading:
            for key in self.image_keys:
                self.delta_timestamps[key] = [i * dt for i in range(0, chunk_length + 1)]
        self.delta_timestamps["observation.state"] = [i * dt for i in range(0, chunk_length + 1)]
        self.delta_timestamps["action"] = [i * dt for i in range(0, chunk_length + 1)]

        # Normalize repo_id and root to lists
        repo_id_list: list[str] = [repo_id] if isinstance(repo_id, str) else list(repo_id)
        root_list: list[str | None]
        if root is None:
            root_list = [None for _ in repo_id_list]
        elif isinstance(root, str):
            root_list = [root]
        else:
            # Copy so the shared module-level default list is never aliased.
            root_list = [r for r in root]

        if len(repo_id_list) != len(root_list):
            raise ValueError(
                f"Length mismatch: repo_id has {len(repo_id_list)} items, root has {len(root_list)} items."
            )

        # Load all datasets
        self.datasets: list[LeRobotDataset] = []
        self.tasks_dfs: list = []  # Store tasks DataFrames for each dataset
        for rid, r in zip(repo_id_list, root_list):
            dataset = LeRobotDataset(
                repo_id=rid,
                root=r,
                delta_timestamps=self.delta_timestamps,  # type: ignore
                tolerance_s=tolerance_s,
                force_cache_sync=force_cache_sync,
                download_videos=download_videos,
                video_backend=video_backend,
                episodes=None,  # Load full dataset, filter later
            )
            self.datasets.append(dataset)
            self.tasks_dfs.append(dataset.meta.tasks)

        # Build index mapping: list of (dataset_idx, local_idx, episode_idx) for valid frames
        self.index_map: list[tuple[int, int, int]] = []
        self._episode_boundaries: list[dict[int, tuple[int, int]]] = []
        self._episode_splits: list[tuple[set[int], set[int]]] = []

        total_episodes = 0
        total_frames = 0
        for ds_idx, dataset in enumerate(self.datasets):
            # Compute episode splits for this dataset
            train_eps, val_eps = self._compute_episode_splits_for_dataset(dataset)
            self._episode_splits.append((train_eps, val_eps))

            # Get episodes for current split
            split_episodes = self._get_split_episodes_for_dataset(ds_idx)

            # Build episode boundaries
            boundaries = self._build_episode_boundaries_for_dataset(dataset)
            self._episode_boundaries.append(boundaries)

            # Filter indices
            indices = self._filter_indices_for_dataset(ds_idx, dataset, split_episodes, boundaries)
            self.index_map.extend(indices)

            total_episodes += dataset.num_episodes
            total_frames += len(dataset)

        log.info(
            f"Loaded LIBERO dataset with {len(repo_id_list)} source(s) split={self.split!r} "
            f"camera_mode={self.camera_mode!r} "
            f"total_episodes={total_episodes} "
            f"total_frames={total_frames} "
            f"valid_indices={len(self.index_map)}"
        )

    def _compute_episode_splits_for_dataset(self, dataset: LeRobotDataset) -> tuple[set[int], set[int]]:
        """Compute train/val episode splits deterministically for a single dataset.

        Returns:
            ``(train_eps, val_eps)`` as disjoint sets of episode indices.

        Raises:
            ValueError: If ``val_ratio`` is not strictly between 0 and 1.
        """
        total_episodes = int(dataset.meta.total_episodes)

        if not (0.0 < self.val_ratio < 1.0):
            raise ValueError(f"{self.val_ratio=} must be in (0, 1).")

        # At least one validation episode is always held out.
        n_val = max(1, int(round(total_episodes * self.val_ratio)))

        # Yihuai: Randomly select validation episodes instead of the first n_val episodes (otherwise task will be repeated)
        rng = random.Random(self.seed)  # To ensure validation episodes are the same on all ranks
        val_eps = set(rng.sample(range(total_episodes), n_val))
        train_eps = set(range(total_episodes)) - val_eps

        log.info(f"train_eps={train_eps}, val_eps={val_eps}")

        return train_eps, val_eps

    def _get_split_episodes_for_dataset(self, ds_idx: int) -> set[int]:
        """Get the episode set for the current split for a specific dataset."""
        train_eps, val_eps = self._episode_splits[ds_idx]
        if self.split in {"val", "valid", "validation", "eval", "test"}:
            return val_eps
        elif self.split == "train":
            return train_eps
        else:  # full
            return train_eps | val_eps

    def _build_episode_boundaries_for_dataset(self, dataset: LeRobotDataset) -> dict[int, tuple[int, int]]:
        """Build a dict of episode_index -> (start_frame, end_frame) for a single dataset."""
        boundaries: dict[int, tuple[int, int]] = {}
        for ep in dataset.meta.episodes:
            ep_idx = int(ep["episode_index"])  # type: ignore[index]
            start = int(ep["dataset_from_index"])  # type: ignore[index]
            end = int(ep["dataset_to_index"])  # type: ignore[index]
            boundaries[ep_idx] = (start, end)
        return boundaries

    def _filter_indices_for_dataset(
        self,
        ds_idx: int,
        dataset: LeRobotDataset,
        split_episodes: set[int],
        boundaries: dict[int, tuple[int, int]],
    ) -> list[tuple[int, int, int]]:
        """Filter valid indices for a single dataset, returning (dataset_idx, local_idx, episode_idx).

        A start frame is kept only when the whole delta-timestamp window fits inside
        its episode. The window is ``chunk_length`` steps of ``1 / self.fps`` seconds,
        so measured in dataset frames it spans ``chunk_length * dataset_fps / self.fps``
        frames; this is longer than ``chunk_length`` whenever ``self.fps`` is below the
        recording rate.

        Args:
            ds_idx: Position of ``dataset`` in ``self.datasets``.
            dataset: Source dataset; only its ``fps`` attribute is read here.
            split_episodes: Episode indices belonging to the active split.
            boundaries: ``episode_index -> (dataset_from_index, dataset_to_index)``.

        Returns:
            One ``(ds_idx, local_idx, ep_idx)`` tuple per valid start frame.
        """
        # NOTE(review): assumes LeRobotDataset exposes the recording rate as ``fps``
        # and converts delta timestamps to frame offsets with it — confirm against
        # the installed lerobot version. Falls back to self.fps (old behavior).
        dataset_fps = getattr(dataset, "fps", None) or self.fps
        horizon = int(round(self.chunk_length * float(dataset_fps) / float(self.fps)))
        # Never be more permissive than the original ``chunk_length`` margin.
        horizon = max(self.chunk_length, horizon)

        index_map: list[tuple[int, int, int]] = []
        for ep_idx in split_episodes:
            # Look up by episode_index rather than by list position.
            bounds = boundaries.get(ep_idx)
            if bounds is None:
                continue
            ep_start, ep_end = bounds

            # Valid range: [start, end - horizon - 1] inclusive, so that the furthest
            # queried frame (start + horizon) is still inside the episode.
            last_valid = ep_end - horizon - 1
            if last_valid >= ep_start:
                index_map.extend((ds_idx, local_idx, ep_idx) for local_idx in range(ep_start, last_valid + 1))

        return index_map

    def __len__(self) -> int:
        """Number of valid start frames across all datasets for the active split."""
        return len(self.index_map)

    def _get_task_description(self, ds_idx: int, item: dict) -> str:
        """Get task description for the current item from meta/tasks.parquet.

        The tasks.parquet has task descriptions as the DataFrame index (row labels)
        and task_index as an integer column. We look up by task_index and return
        the corresponding index name (the actual task description string).

        Raises:
            ValueError: If the item has no ``task_index`` or it is not in the table.
        """
        task_idx = item.get("task_index")
        if task_idx is not None:
            if isinstance(task_idx, torch.Tensor):
                task_idx = task_idx.item()
            task_idx = int(task_idx)
            tasks_df = self.tasks_dfs[ds_idx]
            if task_idx in tasks_df["task_index"].values:
                row = tasks_df[tasks_df["task_index"] == task_idx].iloc[0]
                # The task description is the index name (row label), not a column value
                return str(row.name)
        raise ValueError(f"Task index {task_idx} not found in tasks.parquet for dataset {ds_idx}")

    def _compute_anchored_actions(
        self,
        state_raw: torch.Tensor,
        action_raw: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Compute anchored relative actions (batched).

        Converts frame-wise relative actions to anchored relative actions where each
        action[t] represents the target pose (after applying action[t] to state[t])
        expressed in state 0's local coordinate frame.

        Mathematical formulation:
        1. Compute target in world frame (LIBERO convention):
           - p_{t+1} = p_t + delta_p[t]  (position addition in world frame)
           - R_{t+1} = R_delta[t] @ R_t  (rotation composition, delta first)
        2. Compute anchored (left-multiply by T_0^{-1}):
           - anchored_pos[t] = R_0^T @ (p_{t+1} - p_0)
           - anchored_rot[t] = R_0^T @ R_{t+1}

        Args:
            state_raw: State tensor of shape (T+1, 8): [x, y, z, ax, ay, az, grip1, grip2]
                where (ax, ay, az) is axis-angle rotation.
            action_raw: Action tensor of shape (T+1, 7): [dx, dy, dz, dax, day, daz, grip]
                where (dax, day, daz) is axis-angle rotation delta.

        Returns:
            anchored_translation: (T, 3) - position in state_0's local frame
            anchored_rotation: (T, 3, 3) - rotation matrices relative to state_0
            gripper: (T, 1) - original gripper commands (unchanged)
        """
        # Extract positions and rotations from states
        p_states = state_raw[:, :3]  # [T+1,3]
        rotvec_states = state_raw[:, 3:6]  # [T+1,3] - axis-angle

        # Extract deltas from actions (use first T actions)
        delta_p = action_raw[:-1, :3]  # [T,3]
        delta_rotvec = action_raw[:-1, 3:6]  # [T,3] - axis-angle delta
        gripper = action_raw[:-1, 6:7]  # [T,1]

        # Convert all axis-angle to rotation matrices (batched)
        R_states = convert_rotation(rotvec_states, input_format="axisangle", output_format="matrix")  # [T+1,3,3]
        R_deltas = convert_rotation(delta_rotvec, input_format="axisangle", output_format="matrix")  # [T,3,3]

        # Initial pose (state 0)
        p_0 = p_states[0]  # [3]
        R_0 = R_states[0]  # [3,3]
        R_0_T = R_0.T  # [3,3] - transpose for inverse rotation

        # Current states for t = 0..T-1
        p_t = p_states[:-1]  # [T,3]
        R_t = R_states[:-1]  # [T,3,3]

        # Step 1: Compute target poses in world frame (LIBERO convention)
        # p_target = p_t + delta_p
        p_target = p_t + delta_p  # [T,3]

        # R_target = R_delta @ R_t (batched matrix multiply)
        R_target = torch.bmm(R_deltas, R_t)  # [T,3,3]

        # Step 2: Compute anchored (in state_0's local frame)
        # anchored_p = R_0^T @ (p_target - p_0)
        displacement = p_target - p_0  # [T,3]
        anchored_p = (R_0_T @ displacement.T).T  # [T,3]

        # anchored_R = R_0^T @ R_target (batched)
        R_0_T_expanded = R_0_T.unsqueeze(0).expand(R_target.shape[0], -1, -1)  # [T,3,3]
        anchored_R = torch.bmm(R_0_T_expanded, R_target)  # [T,3,3]

        return anchored_p, anchored_R, gripper

    def _convert_rotation_to_repr(self, rotation_matrix: torch.Tensor) -> torch.Tensor:
        """Convert rotation matrix to the desired representation.

        Args:
            rotation_matrix: Rotation matrices of shape (T, 3, 3).

        Returns:
            Rotation in the configured ``rotation_space`` format.
        """
        return convert_rotation(rotation_matrix, "matrix", libero_rotation_format(self.rotation_space))

    def _normalizer_filename(self) -> str:
        """Return the frame-agnostic stats filename ``{embodiment}_{action_space}_{rot}.json``.

        Raises:
            ValueError: If ``rotation_space`` is not one of 3d/6d/9d.
        """
        rotation_suffix = {
            "3d": "3d",
            "6d": "rot6d",
            "9d": "rot9d",
        }.get(self.rotation_space)
        if rotation_suffix is None:
            raise ValueError(f"Unsupported rotation_space={self.rotation_space!r}.")
        action_space = self.action_space.replace("-", "_")
        return f"{self._embodiment_type}_{action_space}_{rotation_suffix}.json"

    def _normalizer_filenames(self) -> list[str]:
        """Return bundled stats filenames to try, most specific first.

        Bundled LIBERO stats are stored with the pose coordinate frame in the name
        (``libero_native_frame_wise_relative_rot6d.json``), so the frame-qualified
        name is tried before the frame-agnostic one.
        """
        plain = self._normalizer_filename()
        prefix = f"{self._embodiment_type}_"
        names: list[str] = []
        frame = getattr(self, "pose_coordinate_frame", None)
        if frame:
            names.append(f"{prefix}{frame}_{plain[len(prefix):]}")
        names.append(plain)
        return names

    def _resolve_action_stats_path(self, action_stats_path: str | None) -> Path:
        """Locate the action statistics JSON file.

        With ``action_stats_path=None`` the bundled normalizers directory is searched
        using the names from ``_normalizer_filenames``. An absolute path must exist as
        given. A relative path is tried against every parent directory of this module,
        then by basename in the normalizers directory and in the module directory.

        Raises:
            FileNotFoundError: If no candidate exists.
        """
        if action_stats_path is None:
            bundled = [self._NORMALIZERS_DIR / name for name in self._normalizer_filenames()]
            for stats_path in bundled:
                if stats_path.exists():
                    return stats_path
            raise FileNotFoundError(
                f"Could not find bundled LIBERO action stats at {[str(p) for p in bundled]}. "
                "Pass action_stats_path explicitly or regenerate stats with compute_action_stats.py."
            )

        stats_path = Path(action_stats_path)
        if stats_path.is_absolute():
            if stats_path.exists():
                return stats_path
            raise FileNotFoundError(f"Could not find action_stats_path={action_stats_path!r}.")

        module_dir = Path(__file__).resolve().parent
        candidates: list[Path] = []
        for parent in module_dir.parents:
            candidates.append(parent / stats_path)
        candidates.append(self._NORMALIZERS_DIR / stats_path.name)
        candidates.append(module_dir / stats_path.name)
        for candidate in candidates:
            if candidate.exists():
                return candidate
        raise FileNotFoundError(
            f"Could not resolve action_stats_path={action_stats_path!r}; tried: {[str(c) for c in candidates]}"
        )

    def _set_range_denormalization_stats(self) -> None:
        """Populate ``action_min`` / ``action_max`` / ``action_range`` from the loaded stats.

        Only range-style methods are handled: ``minmax`` uses ``min``/``max`` and the
        quantile methods use ``q01``/``q99``. Other methods leave the attributes ``None``.

        Raises:
            ValueError: If the required stat entries are missing.
        """
        if self._norm_stats is None:
            return

        if self.action_normalization == "minmax":
            lo_key, hi_key = "min", "max"
        elif self.action_normalization in ("quantile", "quantile_rot"):
            lo_key, hi_key = "q01", "q99"
        else:
            return

        if lo_key not in self._norm_stats or hi_key not in self._norm_stats:
            raise ValueError(
                f"Action stats for {self.action_normalization!r} normalization require "
                f"{lo_key!r} and {hi_key!r} entries."
            )
        self.action_min = self._norm_stats[lo_key]  # [D]
        self.action_max = self._norm_stats[hi_key]  # [D]
        action_range = self.action_max - self.action_min  # [D]
        # Clamp so constant dimensions cannot cause a division by zero downstream.
        self.action_range = torch.clamp(action_range, min=1e-6)  # [D]

    def __getitem__(self, idx: int, _retry_count: int = 0) -> dict[str, torch.Tensor | str | None]:
        """Get a single item from the dataset.

        If the underlying dataset raises while loading, a different random index is
        tried, up to 10 times, before a ``RuntimeError`` is raised.

        Args:
            idx: Position in ``self.index_map``.
            _retry_count: Internal retry counter; callers should not pass it.

        Returns:
            Sample dict with video (``None`` when video loading is skipped), processed
            and raw actions, raw state, captions and bookkeeping fields.

        Raises:
            RuntimeError: Loading failed on every retry.
            ValueError: Unsupported ``action_space`` or stats/action dimension mismatch.
        """
        max_retries = 10
        ds_idx, local_idx, ep_idx = self.index_map[idx]
        dataset = self.datasets[ds_idx]
        try:
            item = dataset[local_idx]
        except Exception as e:
            log.warning(
                f"Error loading item (retry {_retry_count}/{max_retries}): idx={idx}, ds_idx={ds_idx}, "
                f"local_idx={local_idx}, ep_idx={ep_idx}, repo_id={dataset.meta.repo_id}, error={e}"
            )
            if _retry_count >= max_retries:
                raise RuntimeError(f"Failed to load data after {max_retries} retries") from e
            new_idx = random.randint(0, len(self) - 1)
            return self.__getitem__(new_idx, _retry_count + 1)

        if self.mode == "joint":
            mode = random.choice(["forward_dynamics", "inverse_dynamics", "policy", "image2video"])
        else:
            mode = self.mode

        # Get task description for ai_caption
        task_description = self._get_task_description(ds_idx, item)

        # Process video based on camera mode (skipped entirely when
        # skip_video_loading=True; image keys are also absent from
        # delta_timestamps so LeRobot never decoded them).
        video: torch.Tensor | None
        if self._skip_video_loading:
            video = None
        else:
            if self.camera_mode == "concat_view":
                # Load both cameras and concatenate horizontally
                video_1: torch.Tensor = item["observation.images.image"]
                video_2: torch.Tensor = item["observation.images.wrist_image"]

                # Resize each if needed
                if video_1.shape[-1] != self.image_size or video_1.shape[-2] != self.image_size:
                    video_1 = F.resize(video_1, [self.image_size, self.image_size])
                if video_2.shape[-1] != self.image_size or video_2.shape[-2] != self.image_size:
                    video_2 = F.resize(video_2, [self.image_size, self.image_size])

                # Concatenate along width dimension (last dim for TCHW)
                video_tchw = torch.cat([video_1, video_2], dim=-1)  # (T, C, H, W*2)
            else:
                # Single camera mode
                image_key = self.image_keys[0]
                video_tchw = item[image_key]

                # Resize if needed
                if video_tchw.shape[-1] != self.image_size or video_tchw.shape[-2] != self.image_size:
                    video_tchw = F.resize(video_tchw, [self.image_size, self.image_size])

            # Convert to uint8 and transpose to (C, T, H, W)
            video = (video_tchw * 255).clamp(0, 255).to(torch.uint8).permute(1, 0, 2, 3)

        # Action (raw): LIBERO actions are 7D (6 DoF + gripper)
        action_raw: torch.Tensor = item["action"]
        # State (raw): LIBERO state is 8D (6 DoF + 2 gripper states)
        state_raw: torch.Tensor = item["observation.state"]

        # Action: (T+1, D) -> (T, D)
        # Take all but last action
        # LIBERO action format: [x, y, z, ax, ay, az, gripper] (7D) where (ax,ay,az) is axis-angle

        if self.action_space == "relative":
            # Compute anchored relative actions
            # Returns: translation (T, 3), rotation_matrix (T, 3, 3), gripper (T, 1)
            translation, rotation_matrix, gripper = self._compute_anchored_actions(state_raw, action_raw.clone())
        elif self.action_space == "frame_wise_relative":
            action = action_raw[:-1].clone()  # [T,7]
            translation = action[:, :3]  # [T,3]
            rotation_rotvec = action[:, 3:6]  # [T,3]
            gripper = action[:, 6:]  # [T,1]
            rotation_matrix = convert_rotation(
                rotation_rotvec, input_format="axisangle", output_format="matrix"
            )  # [T,3,3]
        else:
            raise ValueError(f"Unsupported action space: {self.action_space}")

        rotation = self._convert_rotation_to_repr(rotation_matrix)  # [T,rot_dim]
        action = torch.cat([translation, rotation, gripper], dim=-1)  # [T,action_dim]

        # Compute idle_frames from the raw (un-normalized) action, only when the
        # action layout has correct per-frame idle semantics (frame_wise_relative
        # ⇔ backward_framewise). The other action_spaces ("relative",
        # "absolute") encode per-frame motion differently and would not give
        # meaningful idle counts under the same threshold check.
        idle_frames: torch.Tensor | None = None
        if self.action_space == "frame_wise_relative":
            try:
                spec = build_action_spec(Pos(), Rot(libero_rotation_format(self.rotation_space)), Gripper())
                n = compute_idle_frames(action, spec)
                idle_frames = torch.tensor(n, dtype=torch.long)
            except (ValueError, TypeError):
                # Best effort: the sample is still usable without an idle count.
                idle_frames = None

        if self.action_normalization is not None and self._norm_stats is not None and self.action_min is not None:
            if action.shape[-1] != self.action_min.shape[0]:
                raise ValueError(
                    f"Action dimension {action.shape[-1]} does not match stats dimension "
                    f"{self.action_min.shape[0]}. Recompute stats for the current "
                    f"rotation_space={self.rotation_space!r} and action_space={self.action_space!r}."
                )
            # "quantile_rot" differs from "quantile" only in which stats table was loaded.
            method = "quantile" if self.action_normalization == "quantile_rot" else self.action_normalization
            action = normalize_action(action, method, self._norm_stats)  # [T,D]

        # Index
        key = torch.tensor([local_idx], dtype=torch.long)

        if self.camera_mode == "image":
            viewpoint = "third_person_view"
        elif self.camera_mode == "wrist_image":
            viewpoint = "wrist_view"
        else:
            viewpoint = "concat_view"

        result: dict[str, torch.Tensor | str | None] = {
            "source_repo_id": dataset.meta.repo_id,
            "video": video,
            "action": action,
            "action_raw": action_raw,
            "conditioning_fps": torch.tensor(self.fps, dtype=torch.long),
            "prompt": task_description,
            "ai_caption": task_description,
            "mode": mode,
            "state": state_raw,
            "action_space": self.action_space,
            "rotation_space": self.rotation_space,
            "pose_coordinate_frame": self.pose_coordinate_frame,
            "__key__": key,
            "domain_id": torch.tensor(self.domain_id, dtype=torch.long),
            "viewpoint": viewpoint,
        }
        if idle_frames is not None:
            result["idle_frames"] = idle_frames

        if self.camera_mode == "concat_view" and not self._skip_video_loading:
            result["additional_view_description"] = (
                "The left half shows the third-person view; the right half shows the wrist-mounted camera."
            )

        return result

    @property
    def action_dim(self) -> int:
        """Width of the emitted ``[xyz, rotation, gripper]`` action for ``rotation_space``."""
        return libero_action_dim(self.rotation_space)
# Same local-frame post-rotation pattern used by DROID/Bridge/Fractal:
#   R_opencv = R_native @ *_TO_OPENCV.
LIBERO_TO_OPENCV: np.ndarray = np.array(
    [
        [0.0, -1.0, 0.0],
        [1.0, 0.0, 0.0],
        [0.0, 0.0, 1.0],
    ],
    dtype=np.float32,
)

# rotation_space name -> rotation format name understood by ``pose_utils``.
LIBERO_ROTATION_FORMATS: dict[str, RotationConvention] = {
    "3d": "axisangle",
    "6d": "rot6d",
    "9d": "rot9d",
}
# rotation_space name -> total action width (3 translation + rotation + 1 gripper).
LIBERO_ACTION_DIMS: dict[str, int] = {"3d": 7, "6d": 10, "9d": 13}


def libero_rotation_format(rotation_space: str) -> RotationConvention:
    """Map a LIBERO ``rotation_space`` (3d/6d/9d) to its ``pose_utils`` rotation format.

    Raises:
        ValueError: If ``rotation_space`` is not a known setting.
    """
    if rotation_space not in LIBERO_ROTATION_FORMATS:
        raise ValueError(f"Unsupported rotation_space={rotation_space!r}. Use 3d/6d/9d.")
    return LIBERO_ROTATION_FORMATS[rotation_space]


def libero_action_dim(rotation_space: str) -> int:
    """Return the ``[xyz, rotation, gripper]`` action width for a LIBERO ``rotation_space``.

    Raises:
        ValueError: If ``rotation_space`` is not a known setting.
    """
    if rotation_space not in LIBERO_ACTION_DIMS:
        raise ValueError(f"Unsupported rotation_space={rotation_space!r}. Use 3d/6d/9d.")
    return LIBERO_ACTION_DIMS[rotation_space]


def libero_rotation_space_from_action_dim(action_dim: int) -> str:
    """Find the LIBERO rotation space whose unpadded action width equals ``action_dim``.

    Raises:
        ValueError: If no rotation space has that width.
    """
    # Compare with ``==`` (not a reverse-dict lookup) to keep the original matching rule.
    matched = next(
        (name for name, width in LIBERO_ACTION_DIMS.items() if width == action_dim),
        None,
    )
    if matched is None:
        raise ValueError(f"Unable to infer rotation_space from action_dim={action_dim}.")
    return matched


def build_libero_abs_pose(state_raw: torch.Tensor | np.ndarray, *, to_opencv: bool) -> np.ndarray:
    """Turn LIBERO state rows into absolute end-effector poses.

    Each row of ``state_raw`` is ``[x,y,z,axisangle(3),gripper(2)]``; only the first
    six columns are used. With ``to_opencv=True`` the rotation block of every pose is
    right-multiplied by ``LIBERO_TO_OPENCV`` to express the local EE frame in the
    shared OpenCV-style action frame.
    """
    if not isinstance(state_raw, torch.Tensor):
        states = np.asarray(state_raw, dtype=np.float32)
    else:
        states = state_raw.detach().cpu().numpy().astype(np.float32, copy=False)

    positions = states[:, :3]
    axis_angles = states[:, 3:6]
    poses = build_abs_pose_from_components(positions, axis_angles, "axisangle")
    if to_opencv:
        poses[:, :3, :3] = poses[:, :3, :3] @ LIBERO_TO_OPENCV
    return poses
0.000000], + "q99": [ 0.039722, 0.029068, 0.026702, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000] + }, + "global_raw": { + "mean": [-0.000094, -0.000394, 0.001623, 0.998307, -0.001371, 0.000061, 0.001414, 0.998226, -0.000154, 0.582683], + "std": [ 0.013297, 0.009985, 0.012079, 0.004630, 0.050168, 0.029018, 0.050165, 0.004328, 0.031742, 0.489959], + "min": [-0.309451, -0.074740, -0.082767, -0.845782, -0.636628, -0.401535, -0.590214, -0.217448, -0.979635, 0.000000], + "max": [ 0.127018, 0.414660, 0.493186, 1.000000, 0.362611, 0.601211, 0.619479, 1.000000, 0.365993, 1.000000], + "q01": [-0.038884, -0.028667, -0.037840, 0.976292, -0.163098, -0.081545, -0.160193, 0.976322, -0.078872, 0.000000], + "q99": [ 0.039722, 0.029068, 0.026702, 1.000000, 0.160195, 0.081655, 0.163227, 1.000000, 0.095189, 1.000000] + } +} diff --git a/cosmos-framework/cosmos_framework/data/vfm/action/normalizers/droid_lerobot_backward_framewise_rot6d.json b/cosmos-framework/cosmos_framework/data/vfm/action/normalizers/droid_lerobot_backward_framewise_rot6d.json new file mode 100644 index 0000000000000000000000000000000000000000..b2bd81ed7763cf4711ca156d4ad7cb062e67f545 --- /dev/null +++ b/cosmos-framework/cosmos_framework/data/vfm/action/normalizers/droid_lerobot_backward_framewise_rot6d.json @@ -0,0 +1,33 @@ +{ + "metadata": { + "embodiment_type": "droid_lerobot", + "pose_convention": "backward_framewise", + "rotation_format": "rot6d", + "action_dim": 10, + "skip_rotation_dims": [3, 4, 5, 6, 7, 8], + "chunk_length": 16, + "sample_stride": 16, + "dataset_name": "droid_20260418", + "dataset_class": "DROIDLeRobotDataset", + "dataset_root": "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/droid_plus_lerobot_640x360_20260412", + "split": "train", + "num_samples_stats": 1321153, + "reservoir_size": 5000000 + }, + "global": { + "mean": [-0.000017, -0.000612, 0.000568, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.588911], + 
"std": [ 0.004539, 0.004054, 0.004999, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.441186], + "min": [-0.075397, -0.057288, -0.056677, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000], + "max": [ 0.073107, 0.082187, 0.077080, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000], + "q01": [-0.014200, -0.013416, -0.015206, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000], + "q99": [ 0.014515, 0.011517, 0.014520, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000] + }, + "global_raw": { + "mean": [-0.000017, -0.000612, 0.000568, 0.999830, 0.000227, -0.000152, -0.000222, 0.999818, 0.000417, 0.588911], + "std": [ 0.004539, 0.004054, 0.004999, 0.000336, 0.014924, 0.010784, 0.014927, 0.000351, 0.011903, 0.441186], + "min": [-0.075397, -0.057288, -0.056677, 0.695640, -0.220599, -0.195892, -0.697421, 0.600468, -0.154176, 0.000000], + "max": [ 0.073107, 0.082187, 0.077080, 1.000000, 0.698449, 0.168089, 0.220605, 1.000000, 0.391206, 1.000000], + "q01": [-0.014200, -0.013416, -0.015206, 0.998459, -0.047659, -0.034774, -0.047609, 0.998428, -0.035553, 0.000000], + "q99": [ 0.014515, 0.011517, 0.014520, 1.000000, 0.047596, 0.034660, 0.047654, 1.000000, 0.038888, 1.000000] + } +} diff --git a/cosmos-framework/cosmos_framework/data/vfm/action/normalizers/fractal_backward_framewise_rot6d.json b/cosmos-framework/cosmos_framework/data/vfm/action/normalizers/fractal_backward_framewise_rot6d.json new file mode 100644 index 0000000000000000000000000000000000000000..ffbbfcd6b1da0143cddcd70278c7004f1b8fb0b1 --- /dev/null +++ b/cosmos-framework/cosmos_framework/data/vfm/action/normalizers/fractal_backward_framewise_rot6d.json @@ -0,0 +1,33 @@ +{ + "metadata": { + "embodiment_type": "fractal", + "pose_convention": "backward_framewise", + "rotation_format": "rot6d", + "action_dim": 10, + "skip_rotation_dims": [3, 4, 5, 6, 7, 8], + "chunk_length": 16, + "sample_stride": 16, + 
"dataset_name": "fractal_20260413", + "dataset_class": "FractalLeRobotDataset", + "dataset_root": "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/fractal20220817_data_no_noops", + "split": "train", + "num_samples_stats": 166961, + "reservoir_size": 5000000 + }, + "global": { + "mean": [ 0.002259, 0.000721, 0.009372, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.526947], + "std": [ 0.014178, 0.016428, 0.022554, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.499273], + "min": [-0.151886, -0.176424, -0.194576, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000], + "max": [ 0.130892, 0.190835, 0.193839, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000], + "q01": [-0.039816, -0.049270, -0.056266, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000], + "q99": [ 0.043860, 0.050352, 0.072505, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000] + }, + "global_raw": { + "mean": [ 0.002259, 0.000721, 0.009372, 0.998347, 0.001789, 0.002694, -0.001861, 0.997811, 0.016366, 0.526947], + "std": [ 0.014178, 0.016428, 0.022554, 0.003377, 0.043416, 0.037369, 0.043211, 0.004566, 0.047057, 0.499273], + "min": [-0.151886, -0.176424, -0.194576, 0.520558, -0.676280, -0.822475, -0.460521, 0.736643, -0.517041, 0.000000], + "max": [ 0.130892, 0.190835, 0.193839, 1.000000, 0.461026, 0.403940, 0.671708, 1.000000, 0.505528, 1.000000], + "q01": [-0.039816, -0.049270, -0.056266, 0.983667, -0.134543, -0.107048, -0.126518, 0.977277, -0.091363, 0.000000], + "q99": [ 0.043860, 0.050352, 0.072505, 1.000000, 0.127404, 0.107273, 0.134140, 1.000000, 0.179731, 1.000000] + } +} diff --git a/cosmos-framework/cosmos_framework/data/vfm/action/normalizers/libero_native_frame_wise_relative_rot6d.json b/cosmos-framework/cosmos_framework/data/vfm/action/normalizers/libero_native_frame_wise_relative_rot6d.json new file mode 100644 index 
0000000000000000000000000000000000000000..6cde6705bc940460761df6a2bfdfa8f3d074f433 --- /dev/null +++ b/cosmos-framework/cosmos_framework/data/vfm/action/normalizers/libero_native_frame_wise_relative_rot6d.json @@ -0,0 +1,37 @@ +{ + "metadata": { + "embodiment_type": "libero", + "pose_convention": "frame_wise_relative", + "pose_coordinate_frame": "native", + "rotation_format": "6d", + "action_dim": 10, + "skip_rotation_dims": [3, 4, 5, 6, 7, 8], + "chunk_length": 16, + "sample_stride": null, + "dataset_name": "libero", + "dataset_class": "LIBERODataset", + "dataset_root": ["outputs/libero_datasets/libero_10", "outputs/libero_datasets/libero_object", "outputs/libero_datasets/libero_spatial", "outputs/libero_datasets/libero_goal"], + "_comment": "Dataset paths are placeholders; the statistics values are independent of local dataset location.", + "split": "train", + "num_samples_stats": 10000, + "reservoir_size": 50000, + "max_samples": 10000, + "sampling_seed": 42 + }, + "global": { + "mean": [ 0.050704, 0.097407, -0.094833, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.476725], + "std": [ 0.333621, 0.387175, 0.457140, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.499460], + "min": [-0.937500, -0.937500, -0.937500, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000], + "max": [ 0.937500, 0.937500, 0.937500, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000], + "q01": [-0.723214, -0.808929, -0.937500, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000], + "q99": [ 0.937500, 0.870536, 0.937500, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000] + }, + "global_raw": { + "mean": [ 0.050704, 0.097407, -0.094833, 0.994873, -0.004579, -0.004288, 0.004389, 0.996104, 0.001109, 0.476725], + "std": [ 0.333621, 0.387175, 0.457140, 0.010807, 0.077802, 0.063386, 0.078571, 0.009994, 0.038504, 0.499460], + "min": [-0.937500, -0.937500, -0.937500, 0.902028, -0.356085, 
-0.367416, -0.370434, 0.921907, -0.255000, 0.000000], + "max": [ 0.937500, 0.937500, 0.937500, 1.000000, 0.368853, 0.341214, 0.356395, 1.000000, 0.348251, 1.000000], + "q01": [-0.723214, -0.808929, -0.937500, 0.934955, -0.223431, -0.189878, -0.334735, 0.938516, -0.107736, 0.000000], + "q99": [ 0.937500, 0.870536, 0.937500, 1.000000, 0.331000, 0.163153, 0.226216, 1.000000, 0.127158, 1.000000] + } +} diff --git a/cosmos-framework/cosmos_framework/data/vfm/action/normalizers/robomind-franka-dual_backward_framewise_rot6d.json b/cosmos-framework/cosmos_framework/data/vfm/action/normalizers/robomind-franka-dual_backward_framewise_rot6d.json new file mode 100644 index 0000000000000000000000000000000000000000..ddf3b6f43a5e7f49572306bb88ebb12ab7ffb130 --- /dev/null +++ b/cosmos-framework/cosmos_framework/data/vfm/action/normalizers/robomind-franka-dual_backward_framewise_rot6d.json @@ -0,0 +1,33 @@ +{ + "metadata": { + "embodiment_type": "robomind-franka-dual", + "pose_convention": "backward_framewise", + "rotation_format": "rot6d", + "action_dim": 20, + "skip_rotation_dims": [3, 4, 5, 6, 7, 8, 13, 14, 15, 16, 17, 18], + "chunk_length": 16, + "sample_stride": 16, + "dataset_name": "robomind_franka_dual_20260414", + "dataset_class": "RoboMINDFrankaDataset", + "dataset_root": "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/RoboMIND_20251228", + "split": "train", + "num_samples_stats": 21410, + "reservoir_size": 5000000 + }, + "global": { + "mean": [ 0.000231, 0.000179, -0.000319, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.638652, 0.000148, -0.000377, -0.000241, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.815273], + "std": [ 0.014881, 0.008081, 0.014371, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.464058, 0.010628, 0.005868, 0.007900, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.366049], + "min": [-0.115093, -0.096415, -0.112595, -1.000000, -1.000000, -1.000000, 
-1.000000, -1.000000, -1.000000, 0.000000, -0.091252, -0.052148, -0.113650, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000], + "max": [ 0.114941, 0.063433, 0.098721, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.123908, 0.077951, 0.080229, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.999628], + "q01": [-0.051367, -0.031964, -0.046482, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000, -0.035108, -0.021212, -0.029788, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000], + "q99": [ 0.043729, 0.021737, 0.036738, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.047581, 0.021270, 0.025712, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.995443] + }, + "global_raw": { + "mean": [ 0.000231, 0.000179, -0.000319, 0.999350, 0.000318, -0.000135, -0.000286, 0.999586, -0.000051, 0.638652, 0.000148, -0.000377, -0.000241, 0.999294, 0.000343, 0.000431, -0.000147, 0.999580, -0.000570, 0.815273], + "std": [ 0.014881, 0.008081, 0.014371, 0.002235, 0.020196, 0.029781, 0.020185, 0.001125, 0.020484, 0.464058, 0.010628, 0.005868, 0.007900, 0.002664, 0.025404, 0.027550, 0.025193, 0.001657, 0.014210, 0.366049], + "min": [-0.115093, -0.096415, -0.112595, 0.944314, -0.271877, -0.325264, -0.254808, 0.962274, -0.227188, 0.000000, -0.091252, -0.052148, -0.113650, 0.941406, -0.265241, -0.273484, -0.290840, 0.954990, -0.264631, 0.000000], + "max": [ 0.114941, 0.063433, 0.098721, 1.000000, 0.258475, 0.270230, 0.271943, 1.000000, 0.221936, 1.000000, 0.123908, 0.077951, 0.080229, 1.000000, 0.296517, 0.333596, 0.269131, 1.000000, 0.139695, 0.999628], + "q01": [-0.051367, -0.031964, -0.046482, 0.988101, -0.053179, -0.128603, -0.075432, 0.994427, -0.059973, 0.000000, -0.035108, -0.021212, -0.029788, 0.986086, -0.098043, -0.111441, -0.093441, 0.991492, -0.058030, 0.000000], + "q99": [ 0.043729, 0.021737, 0.036738, 1.000000, 0.075612, 0.102791, 
0.053223, 1.000000, 0.077057, 1.000000, 0.047581, 0.021270, 0.025712, 1.000000, 0.095525, 0.126049, 0.098778, 1.000000, 0.041914, 0.995443] + } +} diff --git a/cosmos-framework/cosmos_framework/data/vfm/action/normalizers/robomind-franka_backward_framewise_rot6d.json b/cosmos-framework/cosmos_framework/data/vfm/action/normalizers/robomind-franka_backward_framewise_rot6d.json new file mode 100644 index 0000000000000000000000000000000000000000..9f5d5bfc4a4db7e724ff2c321e84c793c210d0fc --- /dev/null +++ b/cosmos-framework/cosmos_framework/data/vfm/action/normalizers/robomind-franka_backward_framewise_rot6d.json @@ -0,0 +1,33 @@ +{ + "metadata": { + "embodiment_type": "robomind-franka", + "pose_convention": "backward_framewise", + "rotation_format": "rot6d", + "action_dim": 10, + "skip_rotation_dims": [3, 4, 5, 6, 7, 8], + "chunk_length": 16, + "sample_stride": 16, + "dataset_name": "robomind_franka_20260414", + "dataset_class": "RoboMINDFrankaDataset", + "dataset_root": "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/RoboMIND_20251228", + "split": "train", + "num_samples_stats": 141658, + "reservoir_size": 5000000 + }, + "global": { + "mean": [ 0.000241, 0.000073, -0.000597, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.630501], + "std": [ 0.020545, 0.010725, 0.022054, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.434021], + "min": [-0.184377, -0.130924, -0.183947, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000], + "max": [ 0.227682, 0.134118, 0.133222, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000], + "q01": [-0.065029, -0.030683, -0.075321, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, -1.000000, 0.000000], + "q99": [ 0.068546, 0.036309, 0.051772, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000] + }, + "global_raw": { + "mean": [ 0.000241, 0.000073, -0.000597, 0.998782, 0.000605, 0.000003, -0.000592, 0.998245, 
-0.000508, 0.630501], + "std": [ 0.020545, 0.010725, 0.022054, 0.004056, 0.043101, 0.023671, 0.043102, 0.004948, 0.040306, 0.434021], + "min": [-0.184377, -0.130924, -0.183947, 0.837403, -0.525301, -0.384252, -0.543663, 0.801190, -0.490979, 0.000000], + "max": [ 0.227682, 0.134118, 0.133222, 1.000000, 0.543800, 0.389145, 0.522029, 1.000000, 0.414190, 1.000000], + "q01": [-0.065029, -0.030683, -0.075321, 0.981664, -0.137429, -0.069593, -0.140220, 0.976885, -0.140399, 0.000000], + "q99": [ 0.068546, 0.036309, 0.051772, 1.000000, 0.140290, 0.079942, 0.137529, 1.000000, 0.113651, 1.000000] + } +} diff --git a/cosmos-framework/cosmos_framework/data/vfm/action/pose_utils.py b/cosmos-framework/cosmos_framework/data/vfm/action/pose_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..12ea51be92ea82023aecec034f5248bcc3903bb3 --- /dev/null +++ b/cosmos-framework/cosmos_framework/data/vfm/action/pose_utils.py @@ -0,0 +1,747 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +"""Rotation and pose utilities for action datasets. + +This module centralizes three related responsibilities used across the action +dataset stack: + +1. Converting rotations between the conventions used by the datasets and the + action model (`euler_xyz`, quaternion, axis-angle, rot6d, rot9d, matrix). +2. Building absolute homogeneous poses of shape ``(T, 4, 4)`` from per-frame + translation and rotation components. +3. Converting trajectories between absolute-pose form and the relative-pose + action vectors consumed by the datasets. + + The relative-pose action vectors always follow the shared layout + ``[translation(3), rotation(...)]``. The rotation block is encoded with the + requested rotation output convention, and `convert_rotation()` is the + canonical public entrypoint for representation conversion. 
+""" + +import math +from typing import Literal + +import numpy as np +import torch +from scipy.spatial.transform import Rotation as R + +PoseConvention = Literal["absolute", "backward_anchored", "backward_framewise"] +RotationConvention = Literal["matrix", "euler_xyz", "quat_xyzw", "quat_wxyz", "rot6d", "axisangle", "rot9d"] + + +def _to_numpy_float32(array: torch.Tensor | np.ndarray) -> np.ndarray: + """Convert an input array to a NumPy ``float32`` array. + + Args: + array: A torch tensor or NumPy array with arbitrary leading dimensions. + + Returns: + A NumPy array with dtype ``float32``. Torch tensors are moved to CPU + before conversion. NumPy inputs are converted with ``copy=False`` + semantics when possible. + + Raises: + ValueError: If a torch tensor with ``requires_grad=True`` is passed. + These utilities are non-differentiable; callers must explicitly + detach tensors before conversion. + """ + if isinstance(array, torch.Tensor): + if array.requires_grad: + raise ValueError( + "pose_utils conversion is non-differentiable; call `.detach()` " + "explicitly before passing tensors with requires_grad=True" + ) + return array.cpu().numpy().astype(np.float32, copy=False) + return np.asarray(array, dtype=np.float32) + + +def _normalize_rotation_matrices(rot_matrices: np.ndarray) -> np.ndarray: + """Project approximate matrices onto valid rotation matrices. + + This helper uses an SVD-based projection onto ``SO(3)``. It is mainly used + when decoding rotations from network-like representations such as rot6d or rot9d + where the input may not already be perfectly orthonormal. + + Args: + rot_matrices: Array of shape ``(..., 3, 3)`` containing one or more + approximate rotation matrices. + + Returns: + Array of shape ``(..., 3, 3)`` whose trailing matrices are proper + rotation matrices with determinant ``+1``. + + Raises: + ValueError: If the input does not have trailing shape ``(3, 3)``. 
+ """ + matrices = np.asarray(rot_matrices, dtype=np.float32) + if matrices.ndim < 2 or matrices.shape[-2:] != (3, 3): + raise ValueError(f"Rotation matrices must have shape (..., 3, 3), got {matrices.shape}") + + original_shape = matrices.shape[:-2] + matrices_flat = matrices.reshape(-1, 3, 3) + + # Batched SVD projection to SO(3). + U, _, Vt = np.linalg.svd(matrices_flat) + normalized = U @ Vt + + # Ensure determinant is +1 (proper rotations, no reflections). + det = np.linalg.det(normalized) + reflection_mask = det < 0 + if np.any(reflection_mask): + U_reflect = U.copy() + U_reflect[reflection_mask, :, -1] *= -1 + normalized[reflection_mask] = U_reflect[reflection_mask] @ Vt[reflection_mask] + + return normalized.astype(np.float32, copy=False).reshape(*original_shape, 3, 3) + + +def convert_rotation( + rotation: torch.Tensor | np.ndarray, + input_format: RotationConvention, + output_format: RotationConvention, + normalize_matrix: bool = False, +) -> torch.Tensor | np.ndarray: + """Convert rotations between the conventions used by action datasets. + + The function first maps the input representation to rotation matrices and + then emits the requested output convention. It is the single conversion seam + used by the public pose helpers so that all code paths share the same + convention handling. 
+ + Supported input conventions: + - ``matrix``: rotation matrices with shape ``(..., 3, 3)`` + - ``euler_xyz``: Euler xyz angles in radians with shape ``(..., 3)`` + - ``quat_xyzw``: quaternions in SciPy's xyzw order with shape ``(..., 4)`` + - ``quat_wxyz``: quaternions in wxyz order with shape ``(..., 4)`` + - ``rot6d``: column-based 6D representation with shape ``(..., 6)`` + - ``rot9d``: flattened rotation matrices with shape ``(..., 9)`` + - ``axisangle``: axis-angle vectors with shape ``(..., 3)`` + + Supported output conventions: + - ``matrix`` + - ``euler_xyz`` + - ``quat_xyzw`` + - ``quat_wxyz`` + - ``rot6d`` + - ``axisangle`` + - ``rot9d`` + + Args: + rotation: Input rotations in the representation specified by + ``input_format``. + input_format: Convention used by ``rotation``. + output_format: Convention to return. + normalize_matrix: Whether to project intermediate matrices to a valid + rotation before returning. This is most useful when decoding from + approximate ``rot6d``/``rot9d`` inputs or non-unit quaternions. + + Returns: + Rotations with the same leading shape as the input, expressed in the + requested output convention. Torch inputs return torch outputs on the + same device with the same dtype; NumPy inputs return NumPy arrays. + + Raises: + ValueError: If the input shape is incompatible with ``input_format`` or + if either format is unsupported. 
+ """ + input_is_tensor = isinstance(rotation, torch.Tensor) + input_dtype = rotation.dtype if input_is_tensor else None + input_device = rotation.device if input_is_tensor else None + rotation_np = _to_numpy_float32(rotation) + + if input_format == "matrix": + if rotation_np.ndim < 2 or rotation_np.shape[-2:] != (3, 3): + raise ValueError(f"matrix rotation must have shape (..., 3, 3), got {rotation_np.shape}") + original_shape = rotation_np.shape[:-2] + matrices_flat = rotation_np.reshape(-1, 3, 3) + if normalize_matrix: + matrices_flat = _normalize_rotation_matrices(matrices_flat).reshape(-1, 3, 3) + elif input_format == "euler_xyz": + if rotation_np.ndim < 1 or rotation_np.shape[-1] != 3: + raise ValueError(f"{input_format} rotation must have shape (..., 3), got {rotation_np.shape}") + original_shape = rotation_np.shape[:-1] + matrices_flat = R.from_euler("xyz", rotation_np.reshape(-1, 3), degrees=False).as_matrix().astype(np.float32) + elif input_format in ("quat_xyzw", "quat_wxyz"): + if rotation_np.ndim < 1 or rotation_np.shape[-1] != 4: + raise ValueError(f"{input_format} rotation must have shape (..., 4), got {rotation_np.shape}") + original_shape = rotation_np.shape[:-1] + quaternions = rotation_np.reshape(-1, 4) + if input_format == "quat_wxyz": + quaternions = quaternions[:, [1, 2, 3, 0]] + norms = np.linalg.norm(quaternions, axis=-1) + if np.any(norms < 1e-8): + raise ValueError(f"Found zero-norm quaternion(s) (min norm={norms.min():.2e}).") + if normalize_matrix: + quaternions = quaternions / norms[:, None] + matrices_flat = R.from_quat(quaternions).as_matrix().astype(np.float32) + elif input_format == "rot6d": + if rotation_np.ndim < 1 or rotation_np.shape[-1] != 6: + raise ValueError(f"{input_format} rotation must have shape (..., 6), got {rotation_np.shape}") + original_shape = rotation_np.shape[:-1] + rot6d_flat = rotation_np.reshape(-1, 6) + col0 = rot6d_flat[:, :3] + col1 = rot6d_flat[:, 3:] + col2 = np.cross(col0, col1, axis=-1) + matrices_flat 
= np.stack((col0, col1, col2), axis=-1).astype(np.float32) + if normalize_matrix: + matrices_flat = _normalize_rotation_matrices(matrices_flat).reshape(-1, 3, 3) + elif input_format == "rot9d": + if rotation_np.ndim < 1 or rotation_np.shape[-1] != 9: + raise ValueError(f"rot9d rotation must have shape (..., 9), got {rotation_np.shape}") + original_shape = rotation_np.shape[:-1] + matrices_flat = rotation_np.reshape(-1, 3, 3) + if normalize_matrix: + matrices_flat = _normalize_rotation_matrices(matrices_flat).reshape(-1, 3, 3) + elif input_format == "axisangle": + if rotation_np.ndim < 1 or rotation_np.shape[-1] != 3: + raise ValueError(f"axisangle rotation must have shape (..., 3), got {rotation_np.shape}") + original_shape = rotation_np.shape[:-1] + matrices_flat = R.from_rotvec(rotation_np.reshape(-1, 3)).as_matrix().astype(np.float32) + else: + raise ValueError(f"Unsupported input_format: {input_format!r}") + + if output_format == "matrix": + converted = matrices_flat.reshape(*original_shape, 3, 3).astype(np.float32) + elif output_format == "rot9d": + converted = matrices_flat.reshape(-1, 9) + elif output_format == "rot6d": + converted = matrices_flat[:, :, :2].transpose(0, 2, 1).reshape(-1, 6) + elif output_format == "quat_xyzw": + converted = R.from_matrix(matrices_flat).as_quat().astype(np.float32) + elif output_format == "quat_wxyz": + converted = R.from_matrix(matrices_flat).as_quat().astype(np.float32) + converted = converted[:, [3, 0, 1, 2]] + elif output_format == "euler_xyz": + converted = R.from_matrix(matrices_flat).as_euler("xyz", degrees=False).astype(np.float32) + elif output_format == "axisangle": + converted = R.from_matrix(matrices_flat).as_rotvec().astype(np.float32) + else: + raise ValueError(f"Unsupported output_format: {output_format!r}") + + if output_format != "matrix": + converted = converted.reshape(*original_shape, converted.shape[-1]) + + if input_is_tensor: + return 
torch.from_numpy(np.ascontiguousarray(converted)).to(dtype=input_dtype, device=input_device) + return converted + + +# ----------------------------------------------------------------------------- +# Absolute pose construction +# ----------------------------------------------------------------------------- + + +def build_abs_pose_from_components( + xyz: torch.Tensor | np.ndarray, + rotation: torch.Tensor | np.ndarray, + rotation_input_format: Literal["euler_xyz", "quat_xyzw", "quat_wxyz", "axisangle"], + translation_scale: float | None = None, +) -> np.ndarray: + """Build absolute homogeneous poses from per-frame translation and rotation. + + This is the canonical helper for turning dataset-provided pose components + into a sequence of rigid transforms. Each output pose is a homogeneous + transform whose top-left ``3 x 3`` block stores rotation and whose last + column stores translation. + + Args: + xyz: Per-frame translations with shape ``(T, 3)``. + rotation: Per-frame rotations with shape ``(T, 3)`` for ``euler_xyz`` + and ``axisangle``, or ``(T, 4)`` for quaternion conventions. + rotation_input_format: Convention used by ``rotation``. Supported values + are ``euler_xyz``, ``quat_xyzw``, ``quat_wxyz``, and ``axisangle``. + translation_scale: Optional factor used to divide translations before + inserting them into the output poses. This is useful when upstream + data stores translations in a scaled unit. + + Returns: + Absolute poses with shape ``(T, 4, 4)`` and dtype ``float32``. + + Raises: + ValueError: If the translation and rotation arrays have incompatible + lengths or unsupported shapes, or if ``translation_scale`` is zero. 
+ """ + xyz_np = _to_numpy_float32(xyz) + rotation_np = _to_numpy_float32(rotation) + + if xyz_np.ndim != 2 or xyz_np.shape[1] != 3: + raise ValueError(f"xyz must have shape (T, 3), got {xyz_np.shape}") + if rotation_np.ndim != 2: + raise ValueError(f"rotation must be 2D, got {rotation_np.shape}") + if rotation_np.shape[0] != xyz_np.shape[0]: + raise ValueError( + f"xyz and rotation must have the same length, got {xyz_np.shape[0]} and {rotation_np.shape[0]}" + ) + + rot_mats = np.asarray( + convert_rotation(rotation_np, input_format=rotation_input_format, output_format="matrix"), + dtype=np.float32, + ) + + if translation_scale is not None: + if translation_scale == 0: + raise ValueError("translation_scale must be non-zero") + xyz_np = xyz_np / float(translation_scale) + + poses_abs = np.eye(4, dtype=np.float32)[None].repeat(xyz_np.shape[0], axis=0) + poses_abs[:, :3, :3] = rot_mats.astype(np.float32) + poses_abs[:, :3, 3] = xyz_np + return poses_abs + + +# ----------------------------------------------------------------------------- +# Relative pose conversions +# ----------------------------------------------------------------------------- + + +def _delta_transform_to_pose_vector( + delta_T: np.ndarray, + rotation_output_format: RotationConvention, + translation_scale: float = 1.0, + rotation_scale: float = 1.0, +) -> np.ndarray: + """Encode a relative transform as an action vector. + + The shared action-vector layout is always ``[translation(3), rotation(...)]``. + The translation block is multiplied by ``translation_scale`` before concatenation, + and the rotation block is multiplied by ``rotation_scale``. + + Args: + delta_T: Relative transform of shape ``(4, 4)``. + rotation_output_format: Concrete convention used for the output rotation + block. + translation_scale: Scalar multiplier applied to the translation block. + rotation_scale: Scalar multiplier applied to the rotation block. 
Used to + match the loss scale of the rotation block to the translation block. + The decoder must divide by the same factor before reconstructing the + rotation matrix. + + Returns: + A ``float32`` action vector whose first three values are translation and + whose remaining values are the rotation in ``rotation_output_format``. + """ + delta_np = np.asarray(delta_T, dtype=np.float32) + if delta_np.shape != (4, 4): + raise ValueError(f"delta_T must have shape (4, 4), got {delta_np.shape}") + + translation = delta_np[:3, 3] * translation_scale + rotation = np.asarray( + convert_rotation(delta_np[:3, :3], input_format="matrix", output_format=rotation_output_format), + dtype=np.float32, + ) + rotation = rotation * rotation_scale + return np.concatenate([translation, rotation]).astype(np.float32) + + +def _pose_vector_to_delta_transform( + pose_vector: np.ndarray, + rotation_input_format: RotationConvention, + translation_scale: float, + normalize_rotation: bool, + rotation_scale: float = 1.0, +) -> np.ndarray: + """Decode an action vector back into a relative homogeneous transform. + + This is the inverse of `_delta_transform_to_pose_vector()` when the same + rotation convention and scale are used. + + Args: + pose_vector: Relative-pose action vector with layout + ``[translation(3), rotation(...)]``. + rotation_input_format: Concrete convention used by the rotation block. + translation_scale: Scalar used to undo the translation scaling applied during + encoding. + normalize_rotation: Whether to project the decoded rotation to a valid + matrix before assembling the transform. + rotation_scale: Scalar used to undo the rotation scaling applied during + encoding. Must match the value used by + `_delta_transform_to_pose_vector()`. + + Returns: + A relative homogeneous transform with shape ``(4, 4)`` and dtype + ``float32``. 
+ """ + pose_vector_np = np.asarray(pose_vector, dtype=np.float32) + rotation_block = pose_vector_np[3:] / rotation_scale + + rotation_matrix = np.asarray( + convert_rotation( + rotation_block, + input_format=rotation_input_format, + output_format="matrix", + normalize_matrix=normalize_rotation, + ), + dtype=np.float32, + ) + + delta_T = np.eye(4, dtype=np.float32) + delta_T[:3, 3] = pose_vector_np[:3] / translation_scale + delta_T[:3, :3] = rotation_matrix + return delta_T + + +def _get_relative_delta_transform( + poses_abs: np.ndarray, + inv_poses_abs: np.ndarray, + frame_idx: int, + pose_convention: PoseConvention, +) -> np.ndarray: + """Compute one relative transform from an absolute-pose trajectory. + + Args: + poses_abs: Absolute poses of shape ``(T, 4, 4)``. + inv_poses_abs: Precomputed inverses of ``poses_abs`` with the same shape. + frame_idx: Index of the step to encode, in ``[0, T - 2]``. + pose_convention: Pose convention controlling which two poses + define the delta and whether it is framewise or anchored. + + Returns: + The relative transform ``delta_T`` with shape ``(4, 4)`` for the + requested step and convention. + """ + if pose_convention == "backward_framewise": + return inv_poses_abs[frame_idx] @ poses_abs[frame_idx + 1] + if pose_convention == "backward_anchored": + return inv_poses_abs[0] @ poses_abs[frame_idx + 1] + raise ValueError( + f"Unsupported pose_convention={pose_convention!r}. Expected one of: backward_framewise, backward_anchored." + ) + + +def _apply_relative_delta_transform( + current_pose: np.ndarray, + initial_pose: np.ndarray, + delta_T: np.ndarray, + pose_convention: PoseConvention, +) -> np.ndarray: + """Recover the next absolute pose from a decoded relative transform. + + Args: + current_pose: The current reconstructed pose for framewise modes. + initial_pose: The anchor pose used by anchored modes. + delta_T: Relative transform for the current step. 
+ pose_convention: Pose convention controlling how ``delta_T`` + should be composed back into an absolute pose. + + Returns: + The next absolute pose with shape ``(4, 4)``. + """ + if pose_convention == "backward_framewise": + return current_pose @ delta_T + if pose_convention == "backward_anchored": + return initial_pose @ delta_T + raise ValueError( + f"Unsupported pose_convention={pose_convention!r}. Expected one of: backward_framewise, backward_anchored." + ) + + +def pose_abs_to_rel( + poses_abs: np.ndarray, + rotation_format: RotationConvention = "rot9d", + pose_convention: PoseConvention = "backward_framewise", + translation_scale: float = 1.0, + rotation_scale: float = 1.0, +) -> np.ndarray: + """Convert an absolute-pose trajectory into relative-pose action vectors. + + Args: + poses_abs: Absolute poses with shape ``(T, 4, 4)``. These are typically + object-in-world or camera-to-world transforms. + rotation_format: Rotation convention used for the output rotation block. + Supported values are ``rot9d``, ``rot6d``, ``quat_xyzw``, and + ``euler_xyz``. + pose_convention: Pose convention: + - ``backward_framewise``: ``delta_T = T_i^{-1} @ T_{i+1}`` + - ``backward_anchored``: ``delta_T = T_0^{-1} @ T_{i+1}`` + translation_scale: Scalar multiplier applied to the translation block of each + encoded action vector. + rotation_scale: Scalar multiplier applied to the rotation block of each + encoded action vector. Use this to match the loss scale of rotation + and translation. `pose_rel_to_abs()` must be called with the same + value to invert the scaling. + + Returns: + An array of shape ``(T - 1, D)`` where ``D = 3 + rotation_dim``. + + Raises: + AssertionError: If fewer than two absolute poses are provided. 
+ """ + num_frames = len(poses_abs) + assert num_frames > 1, "At least 2 frames are required to compute relative poses" + + # Compute inverse poses + inv_poses_abs = np.linalg.inv(poses_abs) + + poses_rel = [] + # We produce num_frames - 1 relative poses + for i in range(num_frames - 1): + delta_T = _get_relative_delta_transform(poses_abs, inv_poses_abs, i, pose_convention) + poses_rel.append( + _delta_transform_to_pose_vector( + delta_T, + rotation_output_format=rotation_format, + translation_scale=translation_scale, + rotation_scale=rotation_scale, + ) + ) + + return np.stack(poses_rel).astype(np.float32) # [T-1,D] + + +def pose_rel_to_abs( + poses_rel: np.ndarray, + rotation_format: RotationConvention = "rot9d", + pose_convention: PoseConvention = "backward_framewise", + initial_pose: np.ndarray | None = None, + normalize_rotation: bool = True, + translation_scale: float = 1.0, + rotation_scale: float = 1.0, +) -> np.ndarray: + """Reconstruct an absolute-pose trajectory from relative-pose action vectors. + + Args: + poses_rel: Relative-pose action vectors with shape ``(T - 1, D)`` and + layout ``[translation(3), rotation(...)]``. + rotation_format: Convention used by the rotation block of ``poses_rel``. + pose_convention: Pose convention used when the vectors were + encoded. This must match the convention passed to `pose_abs_to_rel()`. + initial_pose: Absolute pose for the first frame. If ``None``, the + identity transform is used. + normalize_rotation: Whether to project decoded rotations onto ``SO(3)`` + before composing them back into the trajectory. + translation_scale: Scalar used to undo the translation scaling applied during + `pose_abs_to_rel()`. + rotation_scale: Scalar used to undo the rotation scaling applied during + `pose_abs_to_rel()`. Must match the value passed there. + + Returns: + Absolute poses with shape ``(T, 4, 4)`` where ``T = len(poses_rel) + 1``. 
+ """ + if initial_pose is None: + initial_pose = np.eye(4) + + poses_abs = [initial_pose] + current_pose = initial_pose + + num_poses_rel = poses_rel.shape[0] + + for i in range(num_poses_rel): + delta_T = _pose_vector_to_delta_transform( + poses_rel[i], + rotation_input_format=rotation_format, + translation_scale=translation_scale, + normalize_rotation=normalize_rotation, + rotation_scale=rotation_scale, + ) + next_pose = _apply_relative_delta_transform(current_pose, initial_pose, delta_T, pose_convention) + + poses_abs.append(next_pose) + current_pose = next_pose + + return np.stack(poses_abs) # [T,4,4] + + +# ----------------------------------------------------------------------------- +# Idle-frame detection +# ----------------------------------------------------------------------------- + + +def _identity_rotation_vector(rotation_format: RotationConvention) -> np.ndarray: + """Return the identity-rotation vector for a given rotation convention. + + Used by :func:`compute_idle_frames` to test whether a rotation block is + close to "no rotation" in its current encoding. + """ + if rotation_format in ("matrix", "rot9d"): + return np.array([1, 0, 0, 0, 1, 0, 0, 0, 1], dtype=np.float32) + if rotation_format == "rot6d": + return np.array([1, 0, 0, 0, 1, 0], dtype=np.float32) + if rotation_format == "quat_xyzw": + return np.array([0, 0, 0, 1], dtype=np.float32) + if rotation_format == "quat_wxyz": + return np.array([1, 0, 0, 0], dtype=np.float32) + if rotation_format in ("euler_xyz", "axisangle"): + return np.array([0, 0, 0], dtype=np.float32) + raise ValueError(f"Unsupported rotation_format={rotation_format!r}") + + +def _rotation_angle_per_arm(rotations: np.ndarray, rotation_format: str) -> np.ndarray: + """Geodesic angle (rad) from identity for each arm at each frame. + + ``rotations`` has shape ``(T, n_arms, n_per_arm)``; the returned array has + shape ``(T, n_arms)``. 
The angle is rotation-format aware so a fixed + ``eps_r`` threshold has consistent geometric meaning across formats: + + - ``rot6d`` → reconstruct ``trace(R)`` in closed form from the two stored + columns ``a, b`` (already unit-orthogonal as they came from a valid + rotation matrix). The third column is ``a × b``, so + ``trace(R) = a[0] + b[1] + a[0]·b[1] - a[1]·b[0]``. + ``angle = arccos(clip((trace - 1) / 2, -1, 1))``. + - ``rot9d`` → reshape to ``(..., 3, 3)`` and use + ``trace(R) = R[0,0] + R[1,1] + R[2,2]``. + - ``quat_xyzw`` / ``quat_wxyz`` → ``angle = 2 · arccos(|q_w|)``; the + absolute value handles the double cover (``q`` and ``-q`` represent the + same rotation). + - ``axisangle`` → the magnitude of the axis-angle vector *is* the angle. + - ``euler_xyz`` → no closed-form angle; use ``‖euler‖`` as a conservative + upper bound (exact for single-axis rotations, an overestimate for + composed ones — fine for idle detection where small angles are the + regime of interest). + """ + if rotation_format == "rot6d": + a = rotations[..., :3] + b = rotations[..., 3:6] + trace = a[..., 0] + b[..., 1] + a[..., 0] * b[..., 1] - a[..., 1] * b[..., 0] + return np.arccos(np.clip((trace - 1.0) / 2.0, -1.0, 1.0)) + if rotation_format == "rot9d": + mat = rotations.reshape(*rotations.shape[:-1], 3, 3) + trace = mat[..., 0, 0] + mat[..., 1, 1] + mat[..., 2, 2] + return np.arccos(np.clip((trace - 1.0) / 2.0, -1.0, 1.0)) + if rotation_format in ("quat_xyzw", "quat_wxyz"): + qw = rotations[..., 3] if rotation_format == "quat_xyzw" else rotations[..., 0] + return 2.0 * np.arccos(np.clip(np.abs(qw), 0.0, 1.0)) + if rotation_format == "axisangle": + return np.linalg.norm(rotations, axis=-1) + if rotation_format == "euler_xyz": + # Exact for single-axis rotations, overestimate for composed ones — + # safe for idle thresholds since overestimation can only mark a frame + # as non-idle, never spuriously idle. 
def _consecutive_streaks(idle: np.ndarray, min_streak: int) -> np.ndarray:
    """Keep only the idle flags that sit inside a long-enough run of Trues.

    Linear two-pointer scan over the mask.

    Args:
        idle: 1-D mask; truthy entries mark raw-idle frames.
        min_streak: Minimum run length (in frames) a run must reach to survive.

    Returns:
        A mask shaped like ``idle`` in which every run of consecutive truthy
        entries shorter than ``min_streak`` has been cleared. When
        ``min_streak <= 1`` the filter is a no-op and the *input object itself*
        is returned (no copy is made).
    """
    if min_streak <= 1:
        return idle
    kept = np.zeros_like(idle)
    total = len(idle)
    start = 0
    while start < total:
        if not idle[start]:
            start += 1
            continue
        # Advance ``stop`` to the first non-idle frame after ``start``.
        stop = start
        while stop < total and idle[stop]:
            stop += 1
        if stop - start >= min_streak:
            kept[start:stop] = True
        # Resume after the run; the frame at ``stop`` (if any) is non-idle.
        start = stop
    return kept


def compute_idle_frames(
    action_raw: torch.Tensor | np.ndarray,
    spec: "ActionSpec",  # noqa: F821 — forward ref, real import is in action_spec.py
    *,
    eps_t: float = 1e-3,
    eps_r: float = math.radians(5.0),
    eps_g: float = 1e-2,
    joint_threshold: float = 5e-4,
    min_streak: int = 3,
) -> int:
    """Count idle frames in a raw (un-normalized) action chunk.

    Idle detection runs per-DimType (driven by ``spec.types``); a frame is
    *raw-idle* iff every relevant type group is idle on that frame, and
    counts toward the final tally only if it belongs to a run of at least
    ``min_streak`` consecutive raw-idle frames. The streak filter rejects
    isolated low-motion frames (instantaneous slowdowns) which carry weak
    physical meaning and add noise to the IdleFrames training signal.

    DimType branches:

    - ``POS`` → combined ``‖action[pos_idx]‖`` (L2 across all POS dims)
      < ``eps_t``. For single-arm specs (3 dims) this is the standard ``‖t‖``
      check; for multi-arm specs the combined norm is slightly stricter than
      a per-arm check.
    - ``ROT`` → per-arm geodesic rotation angle (rad) from identity
      < ``eps_r``. The angle is computed in a rotation-format aware way (see
      :func:`_rotation_angle_per_arm`) so the threshold has consistent
      geometric meaning regardless of the encoding. The largest angle across
      arms is the one compared against the threshold.
    - ``GRIPPER`` → ``max |action[t] - action[t-1]| < eps_g``. ``np.diff``
      with ``prepend=action[0]`` makes step 0 ``|0|`` (treated as "no change");
      with the streak filter this can no longer create a spurious single-frame
      idle event.
    - ``JOINT`` → same frame-diff scheme as gripper with
      ``joint_threshold`` (rad / step).
    - ``RESERVED`` → ignored.

    Defaults (in the units of the un-normalized action):

    - ``eps_t = 1e-3`` → 1 mm per-frame translation
    - ``eps_r = 5°`` → 5° per-frame rotation (geodesic angle)
    - ``eps_g = 1e-2`` → 1 % gripper command change
    - ``joint_threshold = 5e-4`` → ~0.03° / step joint angle change
    - ``min_streak = 3`` → require a run of >= 3 consecutive idle frames

    The input must be **un-normalized** so the identity transform sits at
    known coordinates (translation ≈ 0, rotation ≈ identity). The action
    vector is also assumed to be encoded in a per-step / framewise convention
    (e.g. ``backward_framewise``); anchored conventions (``backward_anchored``)
    accumulate over the chunk and would silently break the POS/ROT idle
    checks. Callers (e.g. the LeRobot base class) gate on pose convention
    before calling this function.

    Args:
        action_raw: ``(T, D)`` action chunk as a tensor (any device / float
            dtype, including ``bfloat16``) or anything ``np.asarray`` accepts.
        spec: Object exposing ``types`` (one DimType per action dim) and
            ``rotation_format`` (only read when ROT dims are present).
        eps_t: POS threshold.
        eps_r: ROT threshold in radians.
        eps_g: GRIPPER per-step change threshold.
        joint_threshold: JOINT per-step change threshold.
        min_streak: Minimum run length of raw-idle frames that is counted;
            values ``<= 1`` disable the streak filter.

    Returns:
        Number of frames classified as idle (``0`` for an empty chunk).

    Raises:
        ValueError: If the input is not 2-D, if ``D`` differs from
            ``len(spec.types)``, or if the number of ROT dims is not a
            multiple of the rotation format's per-arm width.
    """
    if isinstance(action_raw, torch.Tensor):
        # Cast on the torch side before leaving torch: NumPy has no bfloat16
        # dtype, so ``.numpy()`` on a bf16 tensor raises TypeError.
        action = action_raw.detach().cpu().to(torch.float32).numpy()
    else:
        action = np.asarray(action_raw, dtype=np.float32)

    if action.ndim != 2:
        raise ValueError(f"action_raw must be 2-D (T, D); got shape {action.shape}")
    num_frames, action_dim = action.shape
    if num_frames == 0:
        return 0
    if action_dim != len(spec.types):
        raise ValueError(f"action_dim={action_dim} does not match spec.dim={len(spec.types)}")

    # Import locally to avoid a circular import at module load time
    # (action_spec.py imports RotationConvention from this file).
    from cosmos_framework.data.vfm.action.action_spec import DimType

    def _dims_of(kind) -> list[int]:
        """Column indices of ``spec.types`` tagged with ``kind``."""
        return [i for i, t in enumerate(spec.types) if t == kind]

    def _max_abs_step(values: np.ndarray) -> np.ndarray:
        """Per-frame max |x[t] - x[t-1]| across columns; frame 0 yields 0."""
        return np.abs(np.diff(values, axis=0, prepend=values[:1])).max(axis=1)

    pos_idx = _dims_of(DimType.POS)
    rot_idx = _dims_of(DimType.ROT)
    grip_idx = _dims_of(DimType.GRIPPER)
    joint_idx = _dims_of(DimType.JOINT)

    # Start from "everything idle" and AND in each type group's verdict.
    idle = np.ones(num_frames, dtype=bool)

    # POS: combined L2 norm across all translation dims.
    if pos_idx:
        idle &= np.linalg.norm(action[:, pos_idx], axis=1) < eps_t

    # ROT: per-arm geodesic angle (rad).
    if rot_idx:
        rot_id = _identity_rotation_vector(spec.rotation_format)
        n_per_arm = rot_id.shape[0]
        if len(rot_idx) % n_per_arm != 0:
            raise ValueError(
                f"ROT dims ({len(rot_idx)}) not a multiple of "
                f"rotation_format={spec.rotation_format!r} dim ({n_per_arm})"
            )
        rotations = action[:, rot_idx].reshape(num_frames, -1, n_per_arm)
        angles = _rotation_angle_per_arm(rotations, spec.rotation_format)  # (T, n_arms)
        idle &= angles.max(axis=1) < eps_r

    # GRIPPER: max |Δgripper| across all gripper dims; step 0's diff is 0.
    if grip_idx:
        idle &= _max_abs_step(action[:, grip_idx]) < eps_g

    # JOINT: same frame-diff scheme with joint_threshold.
    if joint_idx:
        idle &= _max_abs_step(action[:, joint_idx]) < joint_threshold

    if min_streak > 1:
        idle = _consecutive_streaks(idle, min_streak)

    return int(idle.sum())
"benchmark1_0_release/franka_3rgb/open_cap_trash_can_1", + "benchmark1_0_release/franka_3rgb/open_cap_trash_can_2", + "benchmark1_0_release/franka_3rgb/open_drawer", + "benchmark1_0_release/franka_3rgb/open_the_drawer", + "benchmark1_0_release/franka_3rgb/open_trash", + "benchmark1_0_release/franka_3rgb/pick_apple_into_chest", + "benchmark1_0_release/franka_3rgb/pick_bread_desk", + "benchmark1_0_release/franka_3rgb/pick_bread_into_plate", + "benchmark1_0_release/franka_3rgb/pick_drawer_tool", + "benchmark1_0_release/franka_3rgb/pick_plate_from_plate_rack", + "benchmark1_0_release/franka_3rgb/pick_up_strawberry_from_bowl", + "benchmark1_0_release/franka_3rgb/pick_up_strawberry_in_bowl", + "benchmark1_0_release/franka_3rgb/piled_on_stack_blue_block_on_pink_block", + "benchmark1_0_release/franka_3rgb/piled_on_yellow_block_on_purple_block", + "benchmark1_0_release/franka_3rgb/place_in_block_1", + "benchmark1_0_release/franka_3rgb/place_in_block_in_plate_1", + "benchmark1_0_release/franka_3rgb/place_in_block_on_table", + "benchmark1_0_release/franka_3rgb/place_in_block_tennis_ball", + "benchmark1_0_release/franka_3rgb/place_in_bread_in_basket", + "benchmark1_0_release/franka_3rgb/place_in_bread_in_basket_1", + "benchmark1_0_release/franka_3rgb/place_in_bread_in_bread_machine", + "benchmark1_0_release/franka_3rgb/place_in_bread_in_plate", + "benchmark1_0_release/franka_3rgb/place_in_bread_on_plate_1", + "benchmark1_0_release/franka_3rgb/place_in_bread_on_plate_2", + "benchmark1_0_release/franka_3rgb/place_in_bread_on_table", + "benchmark1_0_release/franka_3rgb/place_in_bread_on_table_1", + "benchmark1_0_release/franka_3rgb/place_in_bread_on_table_2", + "benchmark1_0_release/franka_3rgb/place_in_cylinder", + "benchmark1_0_release/franka_3rgb/place_in_fruit", + "benchmark1_0_release/franka_3rgb/place_in_fruit_bread", + "benchmark1_0_release/franka_3rgb/place_in_fruit_in_basket", + "benchmark1_0_release/franka_3rgb/place_in_fruit_in_fruit_basket", + 
"benchmark1_0_release/franka_3rgb/place_in_fruit_on_table", + "benchmark1_0_release/franka_3rgb/place_in_pick_up_and_throw_away_1", + "benchmark1_0_release/franka_3rgb/place_in_purple_block_in_plate", + "benchmark1_0_release/franka_3rgb/place_in_purple_block_on_table", + "benchmark1_0_release/franka_3rgb/place_in_rectangular_prism", + "benchmark1_0_release/franka_3rgb/place_in_shape", + "benchmark1_0_release/franka_3rgb/place_in_take_bread_and_put_in_plate", + "benchmark1_0_release/franka_3rgb/place_in_toy", + "benchmark1_0_release/franka_3rgb/place_in_trash", + "benchmark1_0_release/franka_3rgb/place_in_yellow_block_on_table", + "benchmark1_0_release/franka_3rgb/place_plate_in_plate_rack", + "benchmark1_0_release/franka_3rgb/place_trash", + "benchmark1_0_release/franka_3rgb/rotate_close_cabinet", + "benchmark1_0_release/franka_3rgb/rotate_open_cabinet", + "benchmark1_0_release/franka_3rgb/slide_close_cabinet", + "benchmark1_0_release/franka_3rgb/slide_close_drawer", + "benchmark1_0_release/franka_3rgb/slide_close_drawer_1", + "benchmark1_0_release/franka_3rgb/slide_close_drawer_1_1", + "benchmark1_0_release/franka_3rgb/slide_open_cabinet", + "benchmark1_0_release/franka_3rgb/slide_open_drawer", + "benchmark1_0_release/franka_3rgb/slide_open_drawer_1", + "benchmark1_0_release/franka_3rgb/stick_target_blue_on_the_pink_obejct", + "benchmark1_0_release/franka_3rgb/twist_knob_start_bread_machine", + "benchmark1_1_release/franka_3rgb/apples_placed_on_a_ceramic_plate", + "benchmark1_1_release/franka_3rgb/bananas_placed_on_a_ceramic_plate", + "benchmark1_1_release/franka_3rgb/bread_is_placed_on_a_ceramic_plate", + "benchmark1_1_release/franka_3rgb/chili_peppers_are_placed_on_a_ceramic_plate", + "benchmark1_1_release/franka_3rgb/chili_peppers_are_placed_on_the_right_side_of_a_plastic_tray", + "benchmark1_1_release/franka_3rgb/close_garbage_bin", + "benchmark1_1_release/franka_3rgb/close_lid", + "benchmark1_1_release/franka_3rgb/cucumber_placed_on_a_ceramic_plate", + 
"benchmark1_1_release/franka_3rgb/flip_marco_cup", + "benchmark1_1_release/franka_3rgb/mobile_marco_cup", + "benchmark1_1_release/franka_3rgb/open_lid", + "benchmark1_1_release/franka_3rgb/place_marker", + "benchmark1_1_release/franka_3rgb/put_potatoes_on_a_ceramic_plate", + "benchmark1_1_release/franka_3rgb/put_the_bread_on_the_table", + "benchmark1_1_release/franka_3rgb/put_the_cucumber_on_the_left_side_of_the_bowl", + "benchmark1_1_release/franka_3rgb/put_the_red_apple_in_the_bowl", + "benchmark1_1_release/franka_3rgb/put_the_steamed_buns_on_a_ceramic_plate", + "benchmark1_1_release/franka_3rgb/red_apple_placed_in_the_center_of_the_desktop", + "benchmark1_1_release/franka_3rgb/side_opening_drawer", + "benchmark1_1_release/franka_3rgb/side_pull_drawer", + "benchmark1_1_release/franka_3rgb/strawberries_on_a_ceramic_plate", + "benchmark1_1_release/franka_3rgb/take_marker_pen", + "benchmark1_1_release/franka_3rgb/turn_off_lamp", + "benchmark1_1_release/franka_3rgb/turn_on_desk_lamp", + "benchmark1_1_release/franka_3rgb/yellow_bananas_placed_on_a_plastic_tray", + "benchmark1_1_release/franka_3rgb/yellow_square_placed_on_ceramic_plate", + "benchmark1_2_release/franka_3rgb/241223_upright_cup", +] + +FRANKA_DUAL_LEROBOT_ROOTS = [ + "benchmark1_1_release/franka_fr3_dual/both_pour_water", + "benchmark1_1_release/franka_fr3_dual/left_place_gray_plate_from_low_rack_on_left_of_table", + "benchmark1_1_release/franka_fr3_dual/left_place_gray_plate_on_lower_rack", + "benchmark1_1_release/franka_fr3_dual/left_push_plate_to_right_of_table", + "benchmark1_1_release/franka_fr3_dual/right_slide_close_drawer", + "benchmark1_1_release/franka_fr3_dual/right_slide_open_drawer", +] + +UR_LEROBOT_ROOTS = [ + "benchmark1_0_release/ur_1rgb/bread_in_basket_1", + "benchmark1_0_release/ur_1rgb/bread_in_basket_old", + "benchmark1_0_release/ur_1rgb/bread_on_table", + "benchmark1_0_release/ur_1rgb/close_top_drawer", + "benchmark1_0_release/ur_1rgb/close_top_white_drawer", + 
"benchmark1_0_release/ur_1rgb/close_trash_can", + "benchmark1_0_release/ur_1rgb/cover_pot_lid", + "benchmark1_0_release/ur_1rgb/green_pepper_in_basket", + "benchmark1_0_release/ur_1rgb/green_pepper_in_basket_1", + "benchmark1_0_release/ur_1rgb/green_pepper_on_table", + "benchmark1_0_release/ur_1rgb/open_pot_lid", + "benchmark1_0_release/ur_1rgb/open_top_drawer", + "benchmark1_0_release/ur_1rgb/open_top_white_drawer", + "benchmark1_0_release/ur_1rgb/open_trash_can", + "benchmark1_0_release/ur_1rgb/pick_up_banana", + "benchmark1_0_release/ur_1rgb/pick_up_bread", + "benchmark1_0_release/ur_1rgb/pick_up_bread_slice", + "benchmark1_0_release/ur_1rgb/pick_up_can", + "benchmark1_0_release/ur_1rgb/pick_up_donut", + "benchmark1_0_release/ur_1rgb/pick_up_egg", + "benchmark1_0_release/ur_1rgb/pick_up_green_onion", + "benchmark1_0_release/ur_1rgb/pick_up_green_pepper", + "benchmark1_0_release/ur_1rgb/pick_up_long_bread", + "benchmark1_0_release/ur_1rgb/pick_up_mangosteen", + "benchmark1_0_release/ur_1rgb/pick_up_paper_ball", + "benchmark1_0_release/ur_1rgb/pick_up_pear", + "benchmark1_0_release/ur_1rgb/pick_up_plastic_bottle", + "benchmark1_0_release/ur_1rgb/pick_up_pot_lid", + "benchmark1_0_release/ur_1rgb/pick_up_red_pepper", + "benchmark1_0_release/ur_1rgb/pick_up_round_bread", + "benchmark1_0_release/ur_1rgb/pick_up_round_bread_1", + "benchmark1_0_release/ur_1rgb/pick_up_square_bread", + "benchmark1_0_release/ur_1rgb/pick_up_toast", + "benchmark1_0_release/ur_1rgb/pick_up_triangle_bread", + "benchmark1_0_release/ur_1rgb/pick_up_yellow_pepper", + "benchmark1_0_release/ur_1rgb/put_banana_in_top_drawer", + "benchmark1_0_release/ur_1rgb/put_bread_in_pot", + "benchmark1_0_release/ur_1rgb/put_bread_slice_in_pot", + "benchmark1_0_release/ur_1rgb/put_bread_slice_in_top_white_drawer", + "benchmark1_0_release/ur_1rgb/put_can_in_trash_can", + "benchmark1_0_release/ur_1rgb/put_donut_in_pot", + "benchmark1_0_release/ur_1rgb/put_donut_in_top_white_drawer", + 
"benchmark1_0_release/ur_1rgb/put_egg_in_pot", + "benchmark1_0_release/ur_1rgb/put_egg_in_top_white_drawer", + "benchmark1_0_release/ur_1rgb/put_green_onion_in_pot", + "benchmark1_0_release/ur_1rgb/put_green_pepper_in_pot", + "benchmark1_0_release/ur_1rgb/put_green_pepper_in_top_white_drawer", + "benchmark1_0_release/ur_1rgb/put_long_bread_in_pot", + "benchmark1_0_release/ur_1rgb/put_long_bread_in_top_white_drawer", + "benchmark1_0_release/ur_1rgb/put_long_bread_in_trash_can", + "benchmark1_0_release/ur_1rgb/put_mangosteen_in_top_white_drawer", + "benchmark1_0_release/ur_1rgb/put_paper_ball_in_trash_can", + "benchmark1_0_release/ur_1rgb/put_pear_in_top_white_drawer", + "benchmark1_0_release/ur_1rgb/put_plastic_bottle_in_trash_can", + "benchmark1_0_release/ur_1rgb/put_pot_lid_on_table", + "benchmark1_0_release/ur_1rgb/put_red_pepper_in_pot", + "benchmark1_0_release/ur_1rgb/put_red_pepper_in_top_white_drawer", + "benchmark1_0_release/ur_1rgb/put_round_bread_in_pot", + "benchmark1_0_release/ur_1rgb/put_round_bread_in_top_white_drawer", + "benchmark1_0_release/ur_1rgb/put_round_bread_in_trash_can", + "benchmark1_0_release/ur_1rgb/put_square_bread_in_top_white_drawer", + "benchmark1_0_release/ur_1rgb/put_square_bread_in_trash_can", + "benchmark1_0_release/ur_1rgb/put_toast_in_pot", + "benchmark1_0_release/ur_1rgb/put_toast_in_top_white_drawer", + "benchmark1_0_release/ur_1rgb/put_triangle_bread_in_pot", + "benchmark1_0_release/ur_1rgb/put_triangle_bread_in_top_white_drawer", + "benchmark1_0_release/ur_1rgb/put_yellow_pepper_in_pot", + "benchmark1_0_release/ur_1rgb/put_yellow_pepper_in_top_drawer", + "benchmark1_0_release/ur_1rgb/put_yellow_pepper_in_top_white_drawer", + "benchmark1_0_release/ur_1rgb/red_pepper_in_basket", + "benchmark1_0_release/ur_1rgb/red_pepper_on_table", + "benchmark1_0_release/ur_1rgb/triangle_bread_in_basket", + "benchmark1_0_release/ur_1rgb/triangle_bread_in_basket_1", + "benchmark1_0_release/ur_1rgb/triangle_bread_on_table", + 
"benchmark1_0_release/ur_1rgb/yellow_pepper_in_basket", + "benchmark1_0_release/ur_1rgb/yellow_pepper_in_basket_1", + "benchmark1_0_release/ur_1rgb/yellow_pepper_on_table", + "benchmark1_1_release/ur_1rgb/green_pepper_on_plate", + "benchmark1_1_release/ur_1rgb/green_pepper_on_the_table", + "benchmark1_1_release/ur_1rgb/insert_the_flowers_from_the_vase_1025", + "benchmark1_1_release/ur_1rgb/insert_the_flowers_from_the_vase_1028", + "benchmark1_1_release/ur_1rgb/insert_the_flowers_from_the_vase_1029", + "benchmark1_1_release/ur_1rgb/insert_the_flowers_from_the_vase_1030", + "benchmark1_1_release/ur_1rgb/insert_the_flowers_from_the_vase_1031", + "benchmark1_1_release/ur_1rgb/insert_the_flowers_from_the_vase_1101", + "benchmark1_1_release/ur_1rgb/insert_the_flowers_from_the_vase_1126", + "benchmark1_1_release/ur_1rgb/pick_up_green_pepper_from_plate", + "benchmark1_1_release/ur_1rgb/pick_up_green_pepper_from_table", + "benchmark1_1_release/ur_1rgb/pick_up_red_pepper_from_plate", + "benchmark1_1_release/ur_1rgb/pick_up_red_pepper_from_table", + "benchmark1_1_release/ur_1rgb/pick_up_the_cucumber_from_the_basket_1125", + "benchmark1_1_release/ur_1rgb/pick_up_the_cucumber_from_the_table_1125", + "benchmark1_1_release/ur_1rgb/pick_up_the_flowers_from_the_table_1025", + "benchmark1_1_release/ur_1rgb/pick_up_the_flowers_from_the_table_1028", + "benchmark1_1_release/ur_1rgb/pick_up_the_flowers_from_the_table_1029", + "benchmark1_1_release/ur_1rgb/pick_up_the_flowers_from_the_table_1030", + "benchmark1_1_release/ur_1rgb/pick_up_the_flowers_from_the_table_1031", + "benchmark1_1_release/ur_1rgb/pick_up_the_flowers_from_the_table_1101", + "benchmark1_1_release/ur_1rgb/pick_up_the_flowers_from_the_table_1126", + "benchmark1_1_release/ur_1rgb/pick_up_the_flowers_from_the_vase_1025", + "benchmark1_1_release/ur_1rgb/pick_up_the_flowers_from_the_vase_1028", + "benchmark1_1_release/ur_1rgb/pick_up_the_flowers_from_the_vase_1029", + 
"benchmark1_1_release/ur_1rgb/pick_up_the_flowers_from_the_vase_1030", + "benchmark1_1_release/ur_1rgb/pick_up_the_flowers_from_the_vase_1031", + "benchmark1_1_release/ur_1rgb/pick_up_the_flowers_from_the_vase_1101", + "benchmark1_1_release/ur_1rgb/pick_up_the_flowers_from_the_vase_1126", + "benchmark1_1_release/ur_1rgb/pick_up_the_green_vegetable_from_the_basket_1121", + "benchmark1_1_release/ur_1rgb/pick_up_the_green_vegetable_from_the_table_1121", + "benchmark1_1_release/ur_1rgb/pick_up_the_mango_from_the_table_1104", + "benchmark1_1_release/ur_1rgb/pick_up_the_mango_from_the_table_1120", + "benchmark1_1_release/ur_1rgb/pick_up_the_mangoes_from_the_basket_1104", + "benchmark1_1_release/ur_1rgb/pick_up_the_mangoes_from_the_basket_1120", + "benchmark1_1_release/ur_1rgb/pick_up_the_oranges_from_the_basket_1119", + "benchmark1_1_release/ur_1rgb/pick_up_the_oranges_from_the_table_1119", + "benchmark1_1_release/ur_1rgb/pick_up_the_sweet_potato_from_the_basket_1122", + "benchmark1_1_release/ur_1rgb/pick_up_the_sweet_potato_from_the_table_1122", + "benchmark1_1_release/ur_1rgb/pick_up_the_yellow_peppers_from_the_basket_1101", + "benchmark1_1_release/ur_1rgb/pick_up_the_yellow_peppers_from_the_table_1101", + "benchmark1_1_release/ur_1rgb/pick_up_yellow_pepper_from_plate", + "benchmark1_1_release/ur_1rgb/pick_up_yellow_pepper_from_plate_copy_1734079773826", + "benchmark1_1_release/ur_1rgb/pick_up_yellow_pepper_from_table", + "benchmark1_1_release/ur_1rgb/pick_up_yellow_pepper_from_table_copy_1734079574938", + "benchmark1_1_release/ur_1rgb/pickupthebananafromtheplate", + "benchmark1_1_release/ur_1rgb/pickupthebananafromthetable", + "benchmark1_1_release/ur_1rgb/place_the_cucumber_on_the_table_1125", + "benchmark1_1_release/ur_1rgb/place_the_flowers_on_the_table_1025", + "benchmark1_1_release/ur_1rgb/place_the_flowers_on_the_table_1028", + "benchmark1_1_release/ur_1rgb/place_the_flowers_on_the_table_1029", + 
"benchmark1_1_release/ur_1rgb/place_the_flowers_on_the_table_1030", + "benchmark1_1_release/ur_1rgb/place_the_flowers_on_the_table_1031", + "benchmark1_1_release/ur_1rgb/place_the_flowers_on_the_table_1101", + "benchmark1_1_release/ur_1rgb/place_the_flowers_on_the_table_1126", + "benchmark1_1_release/ur_1rgb/place_the_green_vegetable_on_the_table_1121", + "benchmark1_1_release/ur_1rgb/place_the_mango_on_the_table_1104", + "benchmark1_1_release/ur_1rgb/place_the_mango_on_the_table_1120", + "benchmark1_1_release/ur_1rgb/place_the_oranges_on_the_table_1119", + "benchmark1_1_release/ur_1rgb/place_the_sweet_potato_on_the_table_1122", + "benchmark1_1_release/ur_1rgb/placebananaonaplate", + "benchmark1_1_release/ur_1rgb/put_the_cucumber_in_the_basket_1125", + "benchmark1_1_release/ur_1rgb/put_the_garbage_in_the_trash_can_1105", + "benchmark1_1_release/ur_1rgb/put_the_garbage_in_the_trash_can_1106", + "benchmark1_1_release/ur_1rgb/put_the_garbage_in_the_trash_can_1107", + "benchmark1_1_release/ur_1rgb/put_the_garbage_in_the_trash_can_1111", + "benchmark1_1_release/ur_1rgb/put_the_garbage_in_the_trash_can_1112", + "benchmark1_1_release/ur_1rgb/put_the_garbage_in_the_trash_can_1113", + "benchmark1_1_release/ur_1rgb/put_the_garbage_in_the_trash_can_1114", + "benchmark1_1_release/ur_1rgb/put_the_garbage_in_the_trash_can_1118", + "benchmark1_1_release/ur_1rgb/put_the_green_vegetable_in_the_basket_1121", + "benchmark1_1_release/ur_1rgb/put_the_mango_in_the_basket_1104", + "benchmark1_1_release/ur_1rgb/put_the_mango_in_the_basket_1120", + "benchmark1_1_release/ur_1rgb/put_the_oranges_in_the_basket_1119", + "benchmark1_1_release/ur_1rgb/put_the_sweet_potato_in_the_basket_1122", + "benchmark1_1_release/ur_1rgb/putthebananaonthetable", + "benchmark1_1_release/ur_1rgb/red_pepper_in_table", + "benchmark1_1_release/ur_1rgb/red_pepper_on_plate", + "benchmark1_1_release/ur_1rgb/uncap_open_the_trash_can_1104", + "benchmark1_1_release/ur_1rgb/uncap_open_the_trash_can_1105", + 
"benchmark1_1_release/ur_1rgb/uncap_open_the_trash_can_1106", + "benchmark1_1_release/ur_1rgb/uncap_open_the_trash_can_1107", + "benchmark1_1_release/ur_1rgb/uncap_open_the_trash_can_1108", + "benchmark1_1_release/ur_1rgb/uncap_open_the_trash_can_1111", + "benchmark1_1_release/ur_1rgb/uncap_open_the_trash_can_1112", + "benchmark1_1_release/ur_1rgb/uncap_open_the_trash_can_1113", + "benchmark1_1_release/ur_1rgb/uncap_open_the_trash_can_1114", + "benchmark1_1_release/ur_1rgb/uncap_open_the_trash_can_1118", + "benchmark1_1_release/ur_1rgb/yellow_pepper_on_plate", + "benchmark1_1_release/ur_1rgb/yellow_pepper_on_plate_copy_1734079761061", +] + +LEROBOT_ROOTS = { + "franka": FRANKA_LEROBOT_ROOTS, + "franka-dual": FRANKA_DUAL_LEROBOT_ROOTS, + "ur": UR_LEROBOT_ROOTS, +} + +FRANKA_OBSERVATION_FEATURES = ["observation.images.camera_top", "observation.states.end_effector"] + +FRANKA_DUAL_OBSERVATION_FEATURES = ["observation.images.camera_front", "observation.states.end_effector"] + +UR_OBSERVATION_FEATURES = ["observation.images.camera_top"] + +OBSERVATION_FEATURES = { + "franka": FRANKA_OBSERVATION_FEATURES, + "franka-dual": FRANKA_DUAL_OBSERVATION_FEATURES, + "ur": UR_OBSERVATION_FEATURES, +} + +# All available camera keys per embodiment (used by concat_view mode). 
+FRANKA_ALL_CAMERA_KEYS = [ + "observation.images.camera_top", + "observation.images.camera_left", + "observation.images.camera_right", +] + +FRANKA_DUAL_ALL_CAMERA_KEYS = [ + "observation.images.camera_front", + "observation.images.camera_left", + "observation.images.camera_right", +] + +ALL_CAMERA_KEYS = { + "franka": FRANKA_ALL_CAMERA_KEYS, + "franka-dual": FRANKA_DUAL_ALL_CAMERA_KEYS, +} + +FRANKA_ACTION_FEATURES = ["actions.joint_position"] + +FRANKA_DUAL_ACTION_FEATURES = ["actions.joint_position"] + +UR_ACTION_FEATURES = ["actions.joint_position"] + +ACTION_FEATURES = { + "franka": FRANKA_ACTION_FEATURES, + "franka-dual": FRANKA_DUAL_ACTION_FEATURES, + "ur": UR_ACTION_FEATURES, +} diff --git a/cosmos-framework/cosmos_framework/data/vfm/action/robomind_franka_dataset.py b/cosmos-framework/cosmos_framework/data/vfm/action/robomind_franka_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..1a1a274cba61be97c44c55473a60a4bfe2b1166d --- /dev/null +++ b/cosmos-framework/cosmos_framework/data/vfm/action/robomind_franka_dataset.py @@ -0,0 +1,296 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# +# This codebase constitutes NVIDIA proprietary technology and is strictly +# confidential. Any unauthorized reproduction, distribution, or disclosure +# of this code, in whole or in part, outside NVIDIA is strictly prohibited +# without prior written consent. +# +# For inquiries regarding the use of this code in other NVIDIA proprietary +# projects, please contact the Deep Imagination Research Team at +# dir@exchange.nvidia.com. 
class RoboMINDFrankaDataset(BaseActionLeRobotDataset):
    """RoboMIND dataset for Franka single-arm and dual-arm embodiments.

    Actions are relative end-effector poses in ``rot6d`` form plus one gripper
    channel per arm: 10-D for ``robomind-franka`` and 20-D for
    ``robomind-franka-dual`` (see :meth:`_build_action_spec`). With
    ``viewpoint="concat_view"`` the three camera streams are tiled into a
    single video (see :meth:`_compose_multi_view_franka`).
    """

    SUPPORTED_EMBODIMENTS: tuple[str, str] = ("robomind-franka", "robomind-franka-dual")

    # RoboMIND-Franka has ~3x faster motion than the typical teleoperation
    # datasets (bridge / DROID / fractal). Empirically (see
    # ``debug/idle_test/recommend_thresholds_norm.txt``) the slowest 1 % of
    # motion sits at ~22 mm/s for single-arm and ~15 mm/s for dual-arm.
    #
    # **Dual-arm caveat**: the dual-arm dataset frequently has one arm's
    # state recorded as a near-zero stutter throughout a chunk (data quality
    # issue — only one arm is actually being teleoperated). Because the POS
    # branch uses the combined L2 across both arms, the threshold then
    # effectively becomes a per-arm threshold for whichever arm is active.
    # We compensate by tightening dual-arm to the global default (5 mm/s,
    # 1.5°/s) so a single arm doing a slow approach (~1mm/f at 10 Hz) is no
    # longer classified as idle.
    #
    # Class defaults below match single-arm. Dual-arm overrides at instance
    # construction (see ``__init__``).
    _IDLE_EPS_T_SINGLE: float = 22e-3
    _IDLE_EPS_R_SINGLE: float = math.radians(3.0)
    _IDLE_EPS_T_DUAL: float = 5e-3  # = base default; tight enough
    _IDLE_EPS_R_DUAL: float = math.radians(1.5)  # for "single-arm-slow" cases
    idle_eps_t_per_sec: float = _IDLE_EPS_T_SINGLE
    idle_eps_r_per_sec: float = _IDLE_EPS_R_SINGLE

    def __init__(
        self,
        root: str = "/lustre/fsw/portfolios/cosmos/projects/cosmos_base_training/cosmos3_action_datasets/RoboMIND_20251228",
        fps: float = 10.0,
        chunk_length: int = 16,
        split_seed: int = 42,
        split_val_ratio: float = 0.05,
        split: str = "train",
        mode: str = "policy",
        embodiment_type: str = "robomind-franka",
        pose_convention: str = "backward_framewise",
        action_normalization: ActionNormalization | None = None,
        viewpoint: Viewpoint = "concat_view",
        enable_fast_init: bool = False,
    ) -> None:
        """Configure shard roots and per-feature timestamps for one embodiment.

        Args:
            root: Either the full ``RoboMIND_20251228`` release directory (the
                configured task shards are then joined onto it) or a path that
                is itself a single LeRobot task shard (used as-is).
            embodiment_type: One of :attr:`SUPPORTED_EMBODIMENTS`.
            viewpoint: ``"concat_view"`` additionally requests every camera
                listed in ``ALL_CAMERA_KEYS`` for the embodiment.
            fps, chunk_length, split_seed, split_val_ratio, split, mode,
            pose_convention, action_normalization, enable_fast_init:
                Forwarded unchanged to the base class.

        Raises:
            ValueError: If ``embodiment_type`` is not supported.
        """
        if embodiment_type not in self.SUPPORTED_EMBODIMENTS:
            raise ValueError(
                f"RoboMINDFrankaDataset only supports {self.SUPPORTED_EMBODIMENTS}, "
                f"got embodiment_type={embodiment_type!r}"
            )

        super().__init__(
            fps=fps,
            chunk_length=chunk_length,
            split_seed=split_seed,
            split_val_ratio=split_val_ratio,
            split=split,
            mode=mode,
            embodiment_type=embodiment_type,
            viewpoint=viewpoint,
            pose_convention=pose_convention,
            rotation_format="rot6d",
            action_normalization=action_normalization,
            tolerance_s=1e-4,
            enable_fast_init=enable_fast_init,
        )

        # Only the 3x3 rotation part of the frame-change matrix is applied.
        self._to_opencv: np.ndarray = _ROBOMIND_FRANKA_TO_OPENCV[:3, :3]
        self._is_concat_view: bool = viewpoint == "concat_view"

        # Per-embodiment idle thresholds (instance-level override of the
        # class default which matches single-arm). Dual-arm tightens both
        # eps_t and eps_r to reflect its smaller per-frame motion tail.
        if embodiment_type == "robomind-franka-dual":
            self.idle_eps_t_per_sec = self._IDLE_EPS_T_DUAL
            self.idle_eps_r_per_sec = self._IDLE_EPS_R_DUAL

        embodiment_key = embodiment_type.removeprefix("robomind-")
        lerobot_roots = LEROBOT_ROOTS[embodiment_key]
        # Full RoboMIND root: expand configured task shards. For the packaged
        # Space demo we point root directly at a single LeRobot task shard,
        # so keep that root as-is (a single empty relative shard path).
        if not root.rstrip("/").endswith("RoboMIND_20251228"):
            lerobot_roots = [""]
        observation_features = list(OBSERVATION_FEATURES[embodiment_key])
        action_features = ACTION_FEATURES[embodiment_key]

        # concat_view needs every camera, not just the default one.
        if self._is_concat_view and embodiment_key in ALL_CAMERA_KEYS:
            for cam_key in ALL_CAMERA_KEYS[embodiment_key]:
                if cam_key not in observation_features:
                    observation_features.append(cam_key)

        self._all_shard_roots = [os.path.join(root, shard_root) if shard_root else root for shard_root in lerobot_roots]
        # Observations are sampled at chunk_length + 1 timestamps, actions at
        # chunk_length timestamps, both starting at offset 0 with step _dt.
        self._delta_timestamps = {
            **{key: [i * self._dt for i in range(0, self._chunk_length + 1)] for key in observation_features},
            **{key: [i * self._dt for i in range(0, self._chunk_length)] for key in action_features},
        }

    def _build_relative_poses(
        self,
        positions: torch.Tensor | np.ndarray,
        euler_xyz: torch.Tensor | np.ndarray,
    ) -> tuple[np.ndarray, torch.Tensor]:
        """Turn absolute position + Euler-XYZ samples into relative rot6d poses.

        The rotation block of every absolute pose is right-multiplied by
        ``self._to_opencv`` before the relative poses are derived under
        ``self._pose_convention``.

        Returns:
            ``(poses_rel, initial_pose)`` where ``initial_pose`` is a float
            tensor copy of the first (frame-converted) absolute pose.
        """
        poses_abs = build_abs_pose_from_components(positions, euler_xyz, "euler_xyz")
        poses_abs[:, :3, :3] = poses_abs[:, :3, :3] @ self._to_opencv
        initial_pose = torch.from_numpy(poses_abs[0].copy()).float()
        pose_convention = cast(PoseConvention, self._pose_convention)
        poses_rel = cast(
            np.ndarray, pose_abs_to_rel(poses_abs, rotation_format="rot6d", pose_convention=pose_convention)
        )
        return poses_rel, initial_pose

    def _build_action(self, sample: dict[str, Any]) -> tuple[torch.Tensor, FrankaInitialPose]:
        """Assemble the action tensor and initial pose(s) for one sample.

        End-effector state columns ``0:3`` / ``3:6`` (and ``6:9`` / ``9:12``
        for the second arm) provide position / Euler angles. The gripper
        channel is ``1 - actions.joint_position[:, 7]`` (and column ``15`` for
        the second arm).

        Returns:
            ``(action, initial_pose)``; for the dual-arm embodiment
            ``initial_pose`` is a ``(left, right)`` tuple.
        """
        state = sample["observation.states.end_effector"]
        gripper = sample["actions.joint_position"]

        if self._embodiment_type == "robomind-franka":
            poses_rel, initial_pose = self._build_relative_poses(state[:, 0:3], state[:, 3:6])
            action = torch.cat(
                [
                    torch.from_numpy(poses_rel).float(),
                    1.0 - gripper[:, [7]],
                ],
                dim=-1,
            )  # [T, 10]
            return action, initial_pose

        poses_rel_left, initial_pose_left = self._build_relative_poses(state[:, 0:3], state[:, 3:6])
        poses_rel_right, initial_pose_right = self._build_relative_poses(state[:, 6:9], state[:, 9:12])
        action = torch.cat(
            [
                torch.from_numpy(poses_rel_left).float(),
                1.0 - gripper[:, [7]],
                torch.from_numpy(poses_rel_right).float(),
                1.0 - gripper[:, [15]],
            ],
            dim=-1,
        )  # [T, 20]
        return action, (initial_pose_left, initial_pose_right)

    def _compose_multi_view_franka(self, sample: dict[str, Any]) -> torch.Tensor:  # returns [T,C,H',W']
        """Tile the three camera streams into one video.

        Layout: the top (single-arm) or front (dual-arm) camera at full
        resolution on the top row; the left and right cameras, bilinearly
        resized, side by side on the bottom row.

        The bottom row is built to be exactly as wide as the top view: the
        left tile gets ``W // 2`` columns and the right tile the remaining
        ``W - W // 2``, so an odd reference width still concatenates cleanly.
        """
        top_or_front_key = (
            "observation.images.camera_top"
            if self._embodiment_type == "robomind-franka"
            else "observation.images.camera_front"
        )
        top_or_front = sample[top_or_front_key]  # [T,C,H,W]
        left = sample["observation.images.camera_left"]  # [T,C,H,W]
        right = sample["observation.images.camera_right"]  # [T,C,H,W]

        _, _, height_ref, width_ref = top_or_front.shape
        half_height = height_ref // 2
        left_width = width_ref // 2
        # Give the right tile whatever is left so left + right == width_ref
        # even when width_ref is odd (equal halves when it is even).
        right_width = width_ref - left_width

        left = F.interpolate(
            left, size=(half_height, left_width), mode="bilinear", align_corners=False
        )  # [T,C,H//2,W//2]
        right = F.interpolate(
            right, size=(half_height, right_width), mode="bilinear", align_corners=False
        )  # [T,C,H//2,W-W//2]
        bottom = torch.cat([left, right], dim=-1)  # [T,C,H//2,W]

        composite = torch.cat([top_or_front, bottom], dim=-2)  # [T,C,H+H//2,W]
        return composite

    def _build_action_spec(self) -> ActionSpec:
        """RoboMIND Franka: 10D single-arm or 20D dual-arm.

        Single (``robomind-franka``):
            ``[Pos, Rot6d, Gripper]`` (10D)

        Dual (``robomind-franka-dual``):
            ``[L_Pos, L_Rot6d, L_Gripper, R_Pos, R_Rot6d, R_Gripper]`` (20D)
        """
        if self._embodiment_type == "robomind-franka":
            return build_action_spec(Pos(), Rot("rot6d"), Gripper())
        # dual arm
        return build_action_spec(
            Pos(prefix="left"),
            Rot("rot6d", prefix="left"),
            Gripper(prefix="left"),
            Pos(prefix="right"),
            Rot("rot6d", prefix="right"),
            Gripper(prefix="right"),
        )

    def __getitem__(self, idx: int) -> dict[str, Any]:
        """Fetch sample ``idx`` and package video, action, caption and extras.

        The video is ``None`` when video loading is skipped, the tiled
        three-camera composite in concat-view mode, and otherwise the single
        default camera of the embodiment.

        Raises:
            ValueError: If the embodiment type is not one this class knows.
        """
        mode, _, _, sample = self._fetch_sample(idx)
        ai_caption = sample["task"]

        if self._skip_video_loading:
            video = None
            additional_view_description = None
        elif self._is_concat_view:
            video = self._compose_multi_view_franka(sample)
            additional_view_description = (
                "The top row shows third-person perspective view looking towards the robot from the front. "
                "The bottom-left video shows the third-person perspective view looking at the scene from the left side. "
                "The bottom-right video shows the third-person perspective view looking at the scene from the right side."
            )
        elif self._embodiment_type == "robomind-franka":
            video = sample["observation.images.camera_top"]  # [T,C,H,W]
            additional_view_description = None
        elif self._embodiment_type == "robomind-franka-dual":
            video = sample["observation.images.camera_front"]  # [T,C,H,W]
            additional_view_description = None
        else:
            raise ValueError(f"Unknown embodiment: {self._embodiment_type}")

        action, initial_pose = self._build_action(sample)

        # Dual-arm returns (left, right); the left pose uses the plain key.
        extras: dict[str, Any] = {}
        if isinstance(initial_pose, tuple):
            extras["initial_pose"] = initial_pose[0]
            extras["initial_pose_right"] = initial_pose[1]
        else:
            extras["initial_pose"] = initial_pose

        if additional_view_description is not None:
            extras["additional_view_description"] = additional_view_description

        return self._build_result(
            mode=mode,
            video=video,
            action=action,
            ai_caption=ai_caption,
            **extras,
        )

    @property
    def action_dim(self) -> int:
        """Width of the action vector: 10 for single-arm, 20 for dual-arm."""
        if self._embodiment_type == "robomind-franka":
            return 10
        if self._embodiment_type == "robomind-franka-dual":
            return 20
        raise ValueError(f"Unknown embodiment: {self._embodiment_type}")
class UMIFastLeRobotDataset(BaseActionLeRobotDataset):
    """UMI/FastUMI LeRobot shard wrapper: 10D = [pos, rot6d, gripper]."""

    def __init__(
        self,
        root: str = "/app/assets/examples/fastumi/fastumi_single_arm/pour_coke",
        fps: float = 20.0,
        chunk_length: int = 16,
        split_seed: int = 42,
        split_val_ratio: float = 0.02,
        split: Literal["train", "val", "full"] = "train",
        mode: str = "policy",
        pose_convention: PoseConvention = "backward_framewise",
        action_normalization: ActionNormalization | None = None,
        viewpoint: Viewpoint = "ego_view",
        enable_fast_init: bool = False,
    ) -> None:
        """Configure the base dataset for a single UMI shard rooted at ``root``.

        All arguments except ``root`` are forwarded unchanged to
        ``BaseActionLeRobotDataset``; the embodiment is fixed to ``"umi"``,
        the rotation format to ``"rot6d"`` and the timestamp tolerance to 1e-4 s.
        """
        super().__init__(
            fps=fps,
            chunk_length=chunk_length,
            split_seed=split_seed,
            split_val_ratio=split_val_ratio,
            split=split,
            mode=mode,
            embodiment_type="umi",
            viewpoint=viewpoint,
            pose_convention=pose_convention,
            rotation_format="rot6d",
            action_normalization=action_normalization,
            tolerance_s=1e-4,
            enable_fast_init=enable_fast_init,
        )
        self._to_opencv = _UMI_TO_OPENCV
        self._all_shard_roots = [root]
        # chunk_length + 1 observation timestamps: image, pose and gripper are
        # all requested on the same grid (self._dt is set by the base class).
        observation_ts = [i * self._dt for i in range(self._chunk_length + 1)]
        self._delta_timestamps = {
            _IMAGE_FEATURE: observation_ts,
            _POSE_FEATURE: observation_ts,
            _GRIPPER_FEATURE: observation_ts,
        }

    def _build_action_spec(self) -> ActionSpec:
        """Describe the action layout: position, rot6d rotation, gripper."""
        return build_action_spec(Pos(), Rot("rot6d"), Gripper())

    def _register_sources(self, shard_indices: list[int] | None = None) -> None:
        """Register every shard root (or only those selected by ``shard_indices``)."""
        roots = self._all_shard_roots if shard_indices is None else [self._all_shard_roots[i] for i in shard_indices]
        for root in roots:
            self._register_source(
                root=root,
                delta_timestamps=self._delta_timestamps,
                tolerance_s=self._tolerance_s,
                video_backend="torchcodec",
                # Label each source with the last path component of its root.
                dataset_label=root.rstrip("/").split("/")[-1],
            )

    def _filter_valid_episodes(self, meta: LeRobotDatasetMetadata, episode_ids: list[int]) -> list[int]:
        """Keep every episode; no filtering is applied for this dataset."""
        return episode_ids

    def __getitem__(self, idx: int) -> dict[str, Any]:
        """Return one sample with its video, relative-pose action and caption."""
        mode, _, _, sample = self._fetch_sample(idx)
        video = sample[_IMAGE_FEATURE]
        pose = sample[_POSE_FEATURE].float()  # [T+1, 7] xyz + wxyz
        grip = sample[_GRIPPER_FEATURE].float()
        if grip.ndim == 1:
            # Make the gripper signal a column so it can be concatenated below.
            grip = grip.unsqueeze(-1)
        # Stored quaternion is wxyz; convert to rot6d and build absolute poses.
        poses_abs = build_abs_pose_from_components(
            pose[:, :3].cpu().numpy(),
            pose[:, 3:7].cpu().numpy(),
            rotation_input_format="quat_wxyz",
        )
        rel = pose_abs_to_rel(poses_abs, rotation_format="rot6d", pose_convention=self._pose_convention)
        # The last gripper reading is dropped before concatenation; presumably
        # `rel` has one row fewer than the observations -- confirm against
        # pose_abs_to_rel.
        action = torch.from_numpy(np.concatenate((rel, grip[:-1].cpu().numpy()), axis=-1)).float()
        return self._build_result(
            mode=mode,
            video=video,
            action=action,
            ai_caption=sample.get("task", "UMI/FastUMI demonstration"),
        )
+export LIBERO_ROOT=/path/to/LIBERO_LeRobot_v3 +``` + +To download the public datasets instead: + +```bash +uvx hf@latest download --repo-type dataset nvidia/bridge_lerobot_v3 \ + --local-dir examples/data/bridge_lerobot_v3 --quiet + +uvx hf@latest download --repo-type dataset nvidia/LIBERO_LeRobot_v3 \ + --revision ddc1edeb6e51e2b7d4d2ba7a1433daaecd37aa64 \ + --local-dir examples/data/LIBERO_LeRobot_v3 --quiet +``` + +## Local launch + +Install viewer-only dependencies if they are missing: + +```bash +uv pip install viser mujoco pin +``` + +Then launch from the repository root: + +```bash +export PYTHONPATH=. +export BRIDGE_LEROBOT_ROOT=/path/to/bridge_lerobot_v3 +export LIBERO_ROOT=/path/to/LIBERO_LeRobot_v3 + +python cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py \ + --port 8020 \ + --share +``` + +`--share` asks `viser` to create a public URL. The share URL expires after about +24 hours and is best treated as disposable; relaunch the viewer if the share +link stops responding. diff --git a/cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/__init__.py b/cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..503ec1b18d584ba1c349360dedbe6951e3216df6 --- /dev/null +++ b/cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
@dataclass
class LazyCall:
    """Minimal stand-in for LazyCall, enough for the standalone viewer.

    Calling an instance does not invoke ``target``; it only records the target
    and the keyword arguments in a plain dict.
    """

    # Callable stored under the "_target_" key of every produced dict.
    target: Callable[..., Any]

    def __call__(self, **kwargs: Any) -> dict[str, Any]:
        """Return ``{"_target_": target}`` extended with ``kwargs``."""
        spec: dict[str, Any] = {"_target_": self.target}
        spec.update(kwargs)
        return spec
if value: + return value + return default + + +BRIDGE_ROOT = _env_path("BRIDGE_LEROBOT_ROOT", "DATASET_PATH", default="/app/assets/examples/bridge_lerobot_v3") + +DATASET_BRIDGE_480 = L(dataset_entry)( + name="bridge_20260501", + dataset=L(BridgeOrigLeRobotDataset)( + action_normalization="quantile_rot", + chunk_length=16, + enable_fast_init=True, + fps=5.0, + mode="joint", + pose_convention="backward_framewise", + root=BRIDGE_ROOT, + split="full", + split_seed=42, + split_val_ratio=0.02, + viewpoint="ego_view", + ), + ratio=1.0, + resolution="480", +) + + +DATASET_FRACTAL_256 = L(dataset_entry)( + name="fractal_20260501", + dataset=L(FractalLeRobotDataset)( + root=_env_path("FRACTAL_ROOT", default="/app/assets/examples/fractal20220817_data"), + split="train", + mode="joint", + action_normalization="quantile_rot", + enable_fast_init=False, + ), + ratio=1, + resolution="256", +) + +DATASET_DROID_480 = L(dataset_entry)( + name="droid_20260501", + dataset=L(DROIDLeRobotDataset)( + root=_env_path("DROID_ROOT", default="/app/assets/examples/droid_plus_lerobot_640x360_20260412"), + split="train", + use_success_only=True, + mode="joint", + action_normalization="quantile_rot", + enable_fast_init=False, + ), + ratio=1, + resolution="480", +) + +DATASET_ROBOMIND_FRANKA_480 = L(dataset_entry)( + name="robomind_franka_20260501", + dataset=L(RoboMINDFrankaDataset)( + root=_env_path("ROBOMIND_FRANKA_ROOT", "ROBOMIND_ROOT", default="/app/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1"), + split="train", + mode="joint", + embodiment_type="robomind-franka", + action_normalization="quantile_rot", + enable_fast_init=False, + ), + ratio=1, + resolution="480", +) + +DATASET_ROBOMIND_FRANKA_DUAL_480 = L(dataset_entry)( + name="robomind_franka_dual_20260501", + dataset=L(RoboMINDFrankaDataset)( + root=_env_path("ROBOMIND_FRANKA_DUAL_ROOT", "ROBOMIND_ROOT", 
default="/app/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water"), + split="train", + mode="joint", + embodiment_type="robomind-franka-dual", + action_normalization="quantile_rot", + enable_fast_init=False, + ), + ratio=1, + resolution="480", +) + + +DATASET_AV_480 = L(dataset_entry)( + name="av_480_20260501", + dataset=L(AVDataset)( + root=_env_path("AV_ROOT", default="/app/assets/examples/av_v2_03292026_wdinfo"), + split="train", + fps=10, + mode="joint", + history_len=0.1, + future_len=6.0, + rotation_format="rot6d", + pose_convention="backward_framewise", + translation_scale=1.35, + max_action_translation_norm=10, + resolution="480", + shuffle=False, + include_route_in_prompt=True, + use_semantic_route_prompt=True, + align_opencv_pose=False, + ), + ratio=1, + resolution="480", +) + +DATASET_UMI_256 = L(dataset_entry)( + name="umi_20260501", + dataset=L(UMIFastLeRobotDataset)( + root=_env_path("UMI_ROOT", default="/app/assets/examples/fastumi/fastumi_single_arm/pour_coke"), + split="train", + mode="joint", + action_normalization=None, + enable_fast_init=False, + ), + ratio=1, + resolution="256", +) diff --git a/cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/droid_franka_robotiq_2f85.xml b/cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/droid_franka_robotiq_2f85.xml new file mode 100644 index 0000000000000000000000000000000000000000..795f68b9f7a1b3500688b558a1904f6233596094 --- /dev/null +++ b/cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/droid_franka_robotiq_2f85.xml @@ -0,0 +1,377 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
def _find_ee_frame(model, robot_name: str | None = None) -> int | None:
    """Resolve the end-effector frame id in a Pinocchio model.

    The ``ee_frame`` entry of the robot config (looked up only when
    ``robot_name`` is given) is tried first; after that the generic candidate
    names are tried in order. A frame id is accepted only when it is smaller
    than ``model.nframes``.

    Returns:
        The first accepted frame id, or None (after logging a warning) when no
        name resolves.
    """

    def _frame_id_or_none(frame_name: str) -> int | None:
        frame_id = model.getFrameId(frame_name)
        return frame_id if frame_id < model.nframes else None

    robot_cfg = get_robot_config(robot_name) if robot_name else {}
    configured_name = robot_cfg.get("ee_frame")
    if configured_name:
        resolved = _frame_id_or_none(configured_name)
        if resolved is not None:
            return resolved
        log.warning(f"Configured ee_frame '{configured_name}' not found in Pinocchio model")

    for candidate_name in get_ee_frame_candidates():
        resolved = _frame_id_or_none(candidate_name)
        if resolved is not None:
            return resolved

    log.warning(f"Could not find EE frame by name for robot '{robot_name}', skipping IK")
    return None
def solve_trajectory_ik(
    mjcf_path: str,
    world_ee_positions: np.ndarray,
    gripper_openings: np.ndarray | None = None,
    world_ee_orientations: np.ndarray | None = None,
    robot_name: str | None = None,
    max_random_samples: int = 50_000,
    seed: int = 42,
) -> np.ndarray | None:
    """Solve IK for a sequence of world-space EE poses (robot-agnostic).

    The first frame is seeded by random sampling inside one or more ranges
    ("basins") of joint 0; every later frame is warm-started from the previous
    solution. The basin whose trajectory has the lowest worst-case error wins.

    Args:
        mjcf_path: Path to the MJCF XML file.
        world_ee_positions: (T, 3) target EE positions.
        gripper_openings: (T,) gripper opening fractions [0=closed, 1=open].
            Ignored unless its length equals T.
        world_ee_orientations: (T, 3, 3) target EE rotation matrices. 6-DoF IK
            is used only when this is given and its length equals T; otherwise
            position-only IK is used.
        robot_name: Robot name for config lookup. When None, no robot-specific
            config (TCP offset, joint names, EE frame override) is applied.
        max_random_samples: Total number of random configs tried for the
            initial seed, split evenly across basins.
        seed: Random seed; basin ``i`` uses ``seed + i``.

    Returns:
        (T, nq) joint configurations, an empty (0, nq) array for an empty
        trajectory, or None if no EE frame is found or no basin yields a seed.
    """
    import pinocchio as pin

    model, cfg = _build_pinocchio_model(mjcf_path, robot_name)
    log.info(f"IK: pinocchio model nq={model.nq}")
    data = model.createData()
    ee_id = _find_ee_frame(model, robot_name)
    if ee_id is None:
        return None
    ee_name = model.frames[ee_id].name
    log.info(f"IK: using EE frame '{ee_name}' (id={ee_id}), nq={model.nq}")

    T = len(world_ee_positions)
    if T == 0:
        # Nothing to solve; the seed search below would index frame 0.
        log.warning("IK skipped: empty trajectory")
        return np.zeros((0, model.nq))
    use_6dof = world_ee_orientations is not None and len(world_ee_orientations) == T

    # Apply TCP offset: dataset EE poses may be at the TCP (e.g. ee_gripper_link)
    # while the Pinocchio frame is at the kinematic link origin (e.g. gripper_link).
    # Convert target TCP poses → IK link-frame targets:
    #   p_link = p_tcp - R_tcp @ tcp_offset
    tcp_offset = cfg.get("tcp_offset")
    if tcp_offset is not None:
        tcp_offset = np.asarray(tcp_offset, dtype=np.float32)
        log.info(f"IK: applying TCP offset {tcp_offset} to target positions")
        # Copy so the caller's array is not modified by the in-place subtraction.
        world_ee_positions = world_ee_positions.copy()
        for t in range(T):
            if use_6dof:
                world_ee_positions[t] -= world_ee_orientations[t] @ tcp_offset
            else:
                world_ee_positions[t] -= tcp_offset

    lower = model.lowerPositionLimit.copy()
    upper = model.upperPositionLimit.copy()

    # Determine arm joints vs finger joints
    # After model reduction, arm joints are first, fingers follow
    n_arm = cfg.get("n_arm_joints", model.nq - 2)  # default: all but last 2 are arm
    n_finger = model.nq - n_arm

    log.info(f"IK: {n_arm} arm joints + {n_finger} finger joints")

    # ── 6-DoF CLIK (position + orientation) ──
    def _ik_6dof(target_pos, target_rot, q_init, max_iter=800, eps_pos=5e-5, eps_rot=1e-3, dt=0.1, damp=1e-4):
        """Damped least-squares IK on position and orientation.

        Returns (q, position error, rotation error, iterations). On
        non-convergence the best iterate seen is returned with its errors.
        """
        q = q_init.copy()
        best_q = q.copy()
        best_total = float("inf")
        stall_count = 0
        rot_weight = 1.0

        for it in range(max_iter):
            pin.forwardKinematics(model, data, q)
            pin.updateFramePlacements(model, data)

            pos_err = target_pos - data.oMf[ee_id].translation
            pos_norm = np.linalg.norm(pos_err)
            R_err = target_rot @ data.oMf[ee_id].rotation.T
            rot_err = pin.log3(R_err)
            rot_norm = np.linalg.norm(rot_err)

            if pos_norm < eps_pos and rot_norm < eps_rot:
                return q, pos_norm, rot_norm, it + 1

            total = pos_norm + 0.05 * rot_norm
            if total < best_total:
                best_total = total
                best_q = q.copy()
                stall_count = 0
            else:
                stall_count += 1

            # When progress stalls, trade orientation accuracy for position by
            # shrinking the rotation weight; give up once it cannot shrink further.
            if stall_count > 50 and rot_weight > 0.1:
                rot_weight *= 0.8
                stall_count = 0
            if stall_count > 150:
                break

            err6 = np.concatenate([pos_err, rot_weight * rot_err])
            J = pin.computeFrameJacobian(model, data, q, ee_id, pin.LOCAL_WORLD_ALIGNED).copy()
            J[3:, :] *= rot_weight
            # Zero out finger joint columns
            J[:, n_arm:] = 0
            JJt = J @ J.T + damp * np.eye(6)
            v = J.T @ np.linalg.solve(JJt, err6)
            v[n_arm:] = 0  # don't move finger joints

            q = pin.integrate(model, q, v * dt)
            q = np.clip(q, lower, upper)

        # Report the errors of the iterate that is actually returned.
        pin.forwardKinematics(model, data, best_q)
        pin.updateFramePlacements(model, data)
        pos_norm = np.linalg.norm(target_pos - data.oMf[ee_id].translation)
        rot_norm = np.linalg.norm(pin.log3(target_rot @ data.oMf[ee_id].rotation.T))
        return best_q, pos_norm, rot_norm, max_iter

    # ── 3-DoF CLIK (position only) ──
    def _ik_3dof(target_pos, q_init, max_iter=500, eps=5e-5, dt=0.15, damp=1e-4):
        """Damped least-squares IK on position only.

        Returns (q, position error, iterations).
        """
        q = q_init.copy()
        for it in range(max_iter):
            pin.forwardKinematics(model, data, q)
            pin.updateFramePlacements(model, data)
            err = target_pos - data.oMf[ee_id].translation
            err_norm = np.linalg.norm(err)
            if err_norm < eps:
                return q, err_norm, it + 1
            J = pin.computeFrameJacobian(model, data, q, ee_id, pin.LOCAL_WORLD_ALIGNED)[:3]
            J[:, n_arm:] = 0
            v = J.T @ np.linalg.solve(J @ J.T + damp * np.eye(3), err)
            v[n_arm:] = 0
            q = pin.integrate(model, q, v * dt)
            q = np.clip(q, lower, upper)
        # Re-evaluate FK at the final iterate so the reported residual belongs
        # to the returned q (the in-loop error predates the last update).
        pin.forwardKinematics(model, data, q)
        pin.updateFramePlacements(model, data)
        return q, np.linalg.norm(target_pos - data.oMf[ee_id].translation), max_iter

    def _solve_full_trajectory(seed_q):
        """Solve all T frames in order, warm-starting each from the previous one.

        Returns (configs, worst position error, worst rotation error).
        """
        configs = []
        max_pe = 0.0
        max_re = 0.0
        q = seed_q.copy()
        for t in range(T):
            if use_6dof:
                q, pe, re, _ = _ik_6dof(world_ee_positions[t], world_ee_orientations[t], q)
                max_pe = max(max_pe, float(pe))
                max_re = max(max_re, float(re))
            else:
                q, pe, _ = _ik_3dof(world_ee_positions[t], q)
                max_pe = max(max_pe, float(pe))
            configs.append(q.copy())
        return np.array(configs), max_pe, max_re

    # ── Multi-start seed search ──
    # For robots with a base rotation joint (Google Robot torso, WidowX waist),
    # search multiple rotation basins. For Franka (no base rotation freedom),
    # use a single wider search.
    base_joint_range = upper[0] - lower[0]
    if base_joint_range > 4.0:
        # Wide base rotation — split into basins (Google Robot, WidowX)
        n_basins = 4
        basin_size = base_joint_range / n_basins
        basins = []
        for i in range(n_basins):
            b_lo = lower[0] + i * basin_size
            b_hi = lower[0] + (i + 1) * basin_size
            basins.append((b_lo, b_hi))
    else:
        # No wide base rotation — single basin (Franka)
        basins = [(lower[0], upper[0])]

    samples_per_basin = max_random_samples // len(basins)
    target0_pos = world_ee_positions[0]
    target0_rot = world_ee_orientations[0] if use_6dof else None

    best_overall_configs = None
    best_overall_score = float("inf")
    best_basin_info = ""
    best_max_pe = 0.0
    best_max_re = 0.0

    for basin_idx, (b_lo, b_hi) in enumerate(basins):
        rng = np.random.RandomState(seed + basin_idx)
        basin_lower = lower.copy()
        basin_upper = upper.copy()
        basin_lower[0] = max(lower[0], b_lo)
        basin_upper[0] = min(upper[0], b_hi)

        # Find best seed in this basin
        basin_best_q = pin.neutral(model)
        basin_best_q[0] = (b_lo + b_hi) / 2
        basin_best_score = float("inf")

        for _ in range(samples_per_basin):
            q = rng.uniform(basin_lower, basin_upper)
            pin.forwardKinematics(model, data, q)
            pin.updateFramePlacements(model, data)
            pos_err = np.linalg.norm(data.oMf[ee_id].translation - target0_pos)

            if target0_rot is not None:
                rot_err = np.linalg.norm(pin.log3(target0_rot.T @ data.oMf[ee_id].rotation))
                score = pos_err + 0.3 * rot_err
            else:
                score = pos_err

            if score < basin_best_score:
                basin_best_score = score
                basin_best_q = q.copy()
                # Good enough seed: stop sampling this basin early.
                if pos_err < 0.005 and (target0_rot is None or rot_err < 0.1):
                    break

        # Skip basins whose best random seed is still far from the first target.
        if basin_best_score > 0.5:
            continue

        configs, max_pe, max_re = _solve_full_trajectory(basin_best_q)
        traj_score = max_pe + 0.05 * max_re
        log.info(
            f" Basin [{b_lo:+.1f}, {b_hi:+.1f}]: seed_score={basin_best_score:.4f}, "
            f"traj max_pos={max_pe * 1000:.1f}mm, max_rot={np.degrees(max_re):.1f}°, "
            f"j0={basin_best_q[0]:+.2f}rad"
        )

        if traj_score < best_overall_score:
            best_overall_score = traj_score
            best_overall_configs = configs
            best_basin_info = f"j0_basin=[{b_lo:+.1f},{b_hi:+.1f}], seed_j0={basin_best_q[0]:+.2f}rad"
            best_max_pe = max_pe
            best_max_re = max_re

    if best_overall_configs is None:
        log.warning("IK failed: no basin converged")
        return None

    configs = best_overall_configs
    if use_6dof:
        log.info(
            f"IK solved ({T} frames, 6-DoF): max_pos={best_max_pe * 1000:.2f}mm, max_rot={np.degrees(best_max_re):.1f}° [{best_basin_info}]"
        )
    else:
        log.info(f"IK solved ({T} frames, 3-DoF): max_pos={best_max_pe * 1000:.2f}mm [{best_basin_info}]")

    # ── Set finger joints from gripper openings ──
    if n_finger > 0:
        finger_min = cfg.get("finger_min", lower[n_arm])
        finger_max = cfg.get("finger_max", upper[n_arm])
    else:
        finger_min = 0.0
        finger_max = 0.0
    close_is_max = cfg.get("finger_close_is_max", True)
    finger_joint_names = cfg.get("finger_joint_names")

    if gripper_openings is not None and len(gripper_openings) == T:
        # Find finger joint indices by name if specified (e.g., Robotiq driver joints)
        if finger_joint_names:
            # Use Pinocchio joint name lookup
            finger_indices = []
            for fjn in finger_joint_names:
                # Pinocchio joint names include the joint name from MJCF
                jid = model.getJointId(fjn)
                if jid < model.njoints:
                    # Pinocchio joint index → qpos index
                    qi = model.idx_qs[jid]
                    finger_indices.append(qi)
                else:
                    log.warning(f"Finger joint '{fjn}' not found in Pinocchio model")
            if not finger_indices:
                # None of the configured names resolved: fall back to the trailing columns.
                finger_indices = list(range(n_arm, n_arm + n_finger))
        else:
            finger_indices = list(range(n_arm, n_arm + n_finger))

        for t in range(T):
            g = float(np.clip(gripper_openings[t], 0.0, 1.0))
            if close_is_max:
                # Robotiq/Google Robot: high angle = closed
                angle = finger_max - g * (finger_max - finger_min)
            else:
                # Franka/WidowX: high value = open
                angle = finger_min + g * (finger_max - finger_min)

            for ji in finger_indices:
                # WidowX right finger is negative range
                if lower[ji] < 0 and upper[ji] < 0:
                    configs[t, ji] = -(finger_min + g * (finger_max - finger_min))
                else:
                    configs[t, ji] = angle
        log.info(
            f"Finger joints set from gripper openings ({gripper_openings.min():.2f} to {gripper_openings.max():.2f})"
        )

    return configs
def _build_pinocchio_model(mjcf_path: str, robot_name: str | None = None):
    """Load a Pinocchio model and optionally reduce it to arm + finger joints.

    A URDF registered for ``robot_name`` takes precedence over ``mjcf_path``.
    When the robot config lists ``arm_joints`` and/or ``finger_joint_names``,
    every other joint is locked at the neutral configuration. Without a
    ``robot_name`` the full MJCF model is returned with an empty config.

    Returns:
        ``(model, cfg)`` -- the (possibly reduced) model and the robot config dict.
    """
    import pinocchio as pin

    if robot_name:
        cfg = get_robot_config(robot_name)
        urdf_path = get_urdf_path(robot_name)
    else:
        cfg = {}
        urdf_path = None

    full_model = pin.buildModelFromUrdf(urdf_path) if urdf_path else pin.buildModelFromMJCF(mjcf_path)

    # Reduce model to arm + finger joints only (lock base, wheels, etc.)
    keep_names = set(cfg.get("arm_joints", [])) | set(cfg.get("finger_joint_names", []))

    model = full_model
    if keep_names:
        # Joint 0 is skipped: the scan starts at index 1, as in the original lookup.
        lock_ids = [ji for ji in range(1, full_model.njoints) if full_model.names[ji] not in keep_names]
        if lock_ids:
            model = pin.buildReducedModel(full_model, lock_ids, pin.neutral(full_model))

    return model, cfg
def verify_ik_with_fk(
    mjcf_path: str,
    joint_configs: np.ndarray,
    target_positions: np.ndarray,
    target_orientations: np.ndarray | None = None,
    robot_name: str | None = None,
) -> dict:
    """Verify an IK solution by running FK and comparing to the targets.

    Args:
        mjcf_path: Path to the MJCF XML file.
        joint_configs: (T, nq) joint configurations to check.
        target_positions: (T, 3) target EE positions.
        target_orientations: Optional (T, 3, 3) target EE rotation matrices.
        robot_name: Robot name forwarded to ``compute_fk_ee_poses``. Pass the
            same name that was given to the IK solver so FK runs on the same
            (possibly reduced) model and EE frame; None keeps the previous
            behaviour of using the default model.

    Returns:
        Dict with ``fk_positions``, ``fk_orientations``, ``pos_errors_mm``,
        ``rot_errors_deg`` (None without target orientations) and a
        human-readable ``summary``. Note that ``compute_fk_ee_poses`` returns
        all-zero poses when no EE frame is found, in which case the reported
        errors are not meaningful.
    """
    import pinocchio as pin

    fk_pos, fk_rot = compute_fk_ee_poses(mjcf_path, joint_configs, robot_name=robot_name)

    T = len(joint_configs)
    pos_errors_mm = np.linalg.norm(fk_pos - target_positions, axis=1) * 1000

    rot_errors_deg = None
    if target_orientations is not None:
        rot_errors_deg = np.zeros(T)
        for t in range(T):
            # Angle of the relative rotation between target and FK orientation.
            R_err = target_orientations[t].T @ fk_rot[t]
            angle = np.linalg.norm(pin.log3(R_err))
            rot_errors_deg[t] = np.degrees(angle)

    summary = f"FK Verification ({T} frames): pos mean={pos_errors_mm.mean():.2f}mm max={pos_errors_mm.max():.2f}mm"
    if rot_errors_deg is not None:
        summary += f", rot mean={rot_errors_deg.mean():.1f}° max={rot_errors_deg.max():.1f}°"

    return {
        "fk_positions": fk_pos,
        "fk_orientations": fk_rot,
        "pos_errors_mm": pos_errors_mm,
        "rot_errors_deg": rot_errors_deg,
        "summary": summary,
    }
def compute_mujoco_geom_transforms(
    mjcf_path: str,
    joint_configs: np.ndarray,
) -> tuple[
    list[list[tuple[np.ndarray, np.ndarray]]],
    list[tuple[np.ndarray, np.ndarray]] | None,
    list[tuple[np.ndarray, np.ndarray]] | None,
    dict[str, list[tuple[np.ndarray, np.ndarray]]] | None,
]:
    """Compute MuJoCo geom/body/site transforms in the Pinocchio-aligned world.

    Also extracts camera site pose, EE body pose, and named body/site frames.

    Important: some MJCFs (notably MuJoCo Menagerie's Google Robot) include a
    fixed ``worldbody -> root_body`` transform that Pinocchio's
    ``buildModelFromMJCF()`` omits. Dataset poses and IK targets already live in
    Pinocchio's root-free world, so we explicitly remove that MuJoCo root
    transform here before returning any MuJoCo-derived poses.

    Args:
        mjcf_path: Path to the MJCF XML file; also used to resolve the robot config.
        joint_configs: (T, n) joint configurations, one row per frame.

    Returns:
        (all_geom_transforms, camera_poses_or_None, ee_poses_or_None, robot_frames_or_None)
        - all_geom_transforms: list of per-frame geom transforms [(pos, mat), ...]
        - camera_poses: list of (pos, mat) per frame for 'camera_site' (or the
          configured camera body when there is no such site), or None.
        - ee_poses: list of (pos, mat) per frame for the EE body, or None.
        - robot_frames: dict mapping ``body:<name>`` / ``site:<name>`` to
          per-frame poses, or None when the model has no named body or site.
    """
    import mujoco

    model = mujoco.MjModel.from_xml_path(mjcf_path)
    data = mujoco.MjData(model)

    visual_geom_ids = get_visual_geom_ids(model)

    # Determine which robot config applies (by matching MJCF filename)
    robot_name = resolve_robot_name_from_mjcf(mjcf_path)
    cfg = get_robot_config(robot_name) if robot_name is not None else {}

    # Find camera_site if it exists
    camera_site_id = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_SITE, "camera_site")
    has_camera_site = camera_site_id >= 0
    if has_camera_site:
        body_id = model.site_bodyid[camera_site_id]
        body_name = mujoco.mj_id2name(model, mujoco.mjtObj.mjOBJ_BODY, body_id) or "?"
        log.info(f"Found camera_site (id={camera_site_id}) on body '{body_name}'")

    # Also look up camera body (e.g. zed_mini for DROID)
    camera_body_id = -1
    camera_body_name = cfg.get("camera_body")
    if camera_body_name and not has_camera_site:
        camera_body_id = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_BODY, camera_body_name)
        if camera_body_id >= 0:
            log.info(f"Found camera body '{camera_body_name}' (id={camera_body_id})")

    all_transforms = []
    camera_poses = [] if (has_camera_site or camera_body_id >= 0) else None

    # Every named body (body 0 is skipped) and every named site is tracked per frame.
    robot_frame_specs = []
    for body_id in range(1, model.nbody):
        body_name = mujoco.mj_id2name(model, mujoco.mjtObj.mjOBJ_BODY, body_id) or ""
        if body_name:
            robot_frame_specs.append(("body", body_name, body_id))
    for site_id in range(model.nsite):
        site_name = mujoco.mj_id2name(model, mujoco.mjtObj.mjOBJ_SITE, site_id) or ""
        if site_name:
            robot_frame_specs.append(("site", site_name, site_id))
    robot_frames = {f"{kind}:{name}": [] for kind, name, _ in robot_frame_specs} if robot_frame_specs else None

    # Find EE body for extracting FK-derived EE pose
    ee_body_id = -1
    ee_candidates = get_ee_frame_candidates(robot_name)
    for candidate in ee_candidates:
        bid = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_BODY, candidate)
        if bid >= 0:
            ee_body_id = bid
            break
    ee_poses = [] if ee_body_id >= 0 else None

    # Find driver joint indices for robots with finger_joint_names
    finger_joint_names = cfg.get("finger_joint_names", [])
    arm_jnames = cfg.get("arm_joints", []) if cfg else []
    # Indices to pin during constraint settling: arm joints + driver joints
    n_arm = cfg.get("n_arm_joints", 7) if cfg else 7
    if arm_jnames:
        # Use name-based mapping for arm joint indices
        pin_indices = []
        for jn in arm_jnames:
            jid = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_JOINT, jn)
            if jid >= 0:
                pin_indices.append(model.jnt_qposadr[jid])
    else:
        pin_indices = list(range(n_arm))
    if finger_joint_names:
        for fjn in finger_joint_names:
            jid = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_JOINT, fjn)
            if jid >= 0:
                pin_indices.append(model.jnt_qposadr[jid])

    # Build mapping from IK output (pinocchio reduced model) to MuJoCo qpos indices.
    # When a URDF is used with model reduction, the IK output has only arm+finger
    # joints in pinocchio order. We need to map those to MuJoCo's qpos order.
    pin_to_mj_map = None  # None = direct mapping (qpos[:len(q)] = q)
    if arm_jnames:
        # Build ordered list: arm joints first, then finger joints
        ordered_jnames = list(arm_jnames) + list(finger_joint_names)
        mj_indices = []
        for jn in ordered_jnames:
            jid = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_JOINT, jn)
            if jid >= 0:
                mj_indices.append(model.jnt_qposadr[jid])
        if mj_indices:
            pin_to_mj_map = mj_indices

    # If joint_configs has more columns than n_arm, the extra column is a
    # normalized gripper signal (raw UR: 0=open, 1=closed). Robotiq ctrl
    # matches: 0=open, 255=closed → ctrl = raw * finger_max (no inversion).
    finger_max = cfg.get("finger_max", 0.0) if cfg else 0.0
    has_gripper_ctrl = finger_max > 0.0 and model.nu > n_arm and joint_configs.shape[1] > n_arm

    def _apply_world_correction(pos: np.ndarray, mat: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        """Map one MuJoCo world pose into the root-free Pinocchio world."""
        # `world_correction` is assigned per frame in the loop below, before
        # this helper is first called.
        corrected_pos = world_correction[:3, :3] @ pos + world_correction[:3, 3]
        corrected_mat = world_correction[:3, :3] @ mat
        return corrected_pos.astype(np.float32), corrected_mat.astype(np.float32)

    for q in joint_configs:
        if pin_to_mj_map and len(q) == len(pin_to_mj_map):
            # Full arm+finger mapping from IK (e.g. Franka): use all joints.
            data.qpos[:] = 0
            for i, mi in enumerate(pin_to_mj_map):
                data.qpos[mi] = q[i]
        else:
            # For robots with a separate gripper ctrl signal (e.g. UR5e),
            # only write arm joints — the 7th column is a raw gripper value,
            # not a qpos DOF. For other robots the pinocchio output already
            # includes finger joints in the trailing columns; write them all.
            n_set = n_arm if has_gripper_ctrl else len(q)
            data.qpos[:n_set] = q[:n_set]
        mujoco.mj_forward(model, data)

        # For robots with equality constraints (e.g., Robotiq 4-bar linkage),
        # step physics to let constraints resolve the passive linkage joints.
        if model.neq > 0:
            data.qvel[:] = 0
            data.ctrl[:] = 0
            # Raw UR gripper maps directly to Robotiq ctrl: 0=open, 255=closed.
            if has_gripper_ctrl:
                data.ctrl[-1] = float(q[n_arm]) * finger_max
            gripper_ctrl_val = float(data.ctrl[-1]) if model.nu > 0 else 0.0
            saved = data.qpos[pin_indices].copy()
            for _ in range(200):
                mujoco.mj_step(model, data)
                # Re-pin the commanded joints so only passive joints settle.
                data.qpos[pin_indices] = saved
                if model.nu > 0:
                    data.ctrl[-1] = gripper_ctrl_val  # keep gripper ctrl during settling
                data.qvel[:] = 0
            mujoco.mj_forward(model, data)
        world_correction = get_mujoco_to_pinocchio_world_transform(model, data, robot_name)

        frame_transforms = []
        for gi in visual_geom_ids:
            pos = data.geom_xpos[gi].copy()
            mat = data.geom_xmat[gi].reshape(3, 3).copy()
            frame_transforms.append(_apply_world_correction(pos, mat))
        all_transforms.append(frame_transforms)

        # Extract camera site pose
        if has_camera_site:
            cam_pos = data.site_xpos[camera_site_id].copy()
            cam_mat = data.site_xmat[camera_site_id].reshape(3, 3).copy()
            camera_poses.append(_apply_world_correction(cam_pos, cam_mat))
        elif camera_body_id >= 0:
            cam_pos = data.xpos[camera_body_id].copy()
            cam_mat = data.xmat[camera_body_id].reshape(3, 3).copy()
            camera_poses.append(_apply_world_correction(cam_pos, cam_mat))

        # Extract EE body pose
        if ee_body_id >= 0:
            ee_pos = data.xpos[ee_body_id].copy()
            ee_mat = data.xmat[ee_body_id].reshape(3, 3).copy()
            ee_poses.append(_apply_world_correction(ee_pos, ee_mat))

        if robot_frames is not None:
            for kind, name, frame_id in robot_frame_specs:
                if kind == "body":
                    pos = data.xpos[frame_id].copy()
                    mat = data.xmat[frame_id].reshape(3, 3).copy()
                else:
                    pos = data.site_xpos[frame_id].copy()
                    mat = data.site_xmat[frame_id].reshape(3, 3).copy()
                robot_frames[f"{kind}:{name}"].append(_apply_world_correction(pos, mat))

    return all_transforms, camera_poses, ee_poses, robot_frames
index 0000000000000000000000000000000000000000..0dfc5c7673d29c00da4407859e3ceb48277dc4c0 --- /dev/null +++ b/cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/robot_scene_model.py @@ -0,0 +1,216 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +"""Public robot-scene abstraction for the action viewer. + +`RobotSceneModel` is the viewer-facing contract for robot assets and +kinematics. It wraps the lower-level MuJoCo / Pinocchio helpers so callers do +not need to coordinate mesh loading, IK, frame extraction, or world-alignment +corrections themselves. +""" + +from __future__ import annotations + +import os +from collections.abc import Callable +from dataclasses import dataclass +from typing import Any + +import numpy as np + +from cosmos_framework.data.vfm.action.urdf_visualizer import urdf_loader + +MeshSpec = tuple[str, object, np.ndarray] +FramePose = tuple[np.ndarray, np.ndarray] +FrameSeries = dict[str, list[FramePose]] + + +@dataclass(frozen=True) +class RobotTrajectory: + """Render-ready robot geometry for one solved trajectory.""" + + mesh_transforms: list[list[FramePose]] + named_frames: FrameSeries + + +def get_robot_config(robot_name: str) -> dict[str, Any]: + """Return the static config for one supported robot.""" + cfg = urdf_loader.ROBOT_CONFIGS.get(robot_name) + if cfg is None: + raise ValueError(f"Unknown robot: {robot_name}. 
Available: {list(urdf_loader.ROBOT_CONFIGS)}") + return cfg + + +def get_urdf_path(robot_name: str) -> str | None: + """Return the preferred URDF path for one robot, if available.""" + return urdf_loader.get_urdf_path(robot_name) + + +def get_mjcf_path(robot_name: str) -> str: + """Return the canonical MJCF path for one robot.""" + return urdf_loader.get_mjcf_path(robot_name) + + +def get_mujoco_to_pinocchio_world_transform(model: Any, data: Any, robot_name: str | None = None) -> np.ndarray: + """Map MuJoCo world poses into the root-free Pinocchio world.""" + return urdf_loader.get_mujoco_to_pinocchio_world_transform(model, data, robot_name) + + +def get_visual_geom_ids(model: Any) -> list[int]: + """Return the MuJoCo visual geom order used by the viewer.""" + return urdf_loader._get_visual_geom_ids(model) + + +def get_ee_frame_candidates(robot_name: str | None = None) -> list[str]: + """Return the ordered end-effector frame candidates for one robot.""" + if robot_name is None: + return list(urdf_loader._EE_FRAME_CANDIDATES) + cfg = urdf_loader.ROBOT_CONFIGS.get(robot_name, {}) + ee_frame = cfg.get("ee_frame") + if ee_frame is None: + return list(urdf_loader._EE_FRAME_CANDIDATES) + return [str(ee_frame), *urdf_loader._EE_FRAME_CANDIDATES] + + +def get_robot_loaders() -> dict[str, Callable[[], tuple[list[MeshSpec], np.ndarray]]]: + """Return the low-level robot mesh loader registry.""" + return urdf_loader.get_robot_loaders() + + +def resolve_robot_name_from_mjcf(mjcf_path: str) -> str | None: + """Infer a configured robot name from an MJCF filename.""" + filename = os.path.basename(mjcf_path) + for robot_name, cfg in urdf_loader.ROBOT_CONFIGS.items(): + if filename == cfg.get("mjcf"): + return robot_name + return None + + +def _copy_mesh_specs(meshes: list[MeshSpec]) -> list[MeshSpec]: + """Copy mesh transforms while reusing immutable mesh geometry.""" + return [(name, mesh, transform.copy().astype(np.float32)) for name, mesh, transform in meshes] + + +def 
_transform_mesh_specs(meshes: list[MeshSpec], base_pose: np.ndarray | None) -> list[MeshSpec]: + """Apply one rigid transform to a list of world-space mesh poses.""" + copied = _copy_mesh_specs(meshes) + if base_pose is None: + return copied + transformed: list[MeshSpec] = [] + for name, mesh, transform in copied: + transformed.append((name, mesh, (base_pose @ transform).astype(np.float32))) # [4,4] + return transformed + + +def _remove_pose_base(poses_world: np.ndarray, base_pose: np.ndarray | None) -> np.ndarray: + """Map world-space wrist poses back into one arm-local base frame.""" + if base_pose is None: + return poses_world.astype(np.float32) + base_inv = np.linalg.inv(base_pose).astype(np.float32) # [4,4] + return np.einsum("ij,njk->nik", base_inv, poses_world).astype(np.float32) # [T,4,4] + + +def _apply_base_to_mesh_transforms( + transforms: list[list[FramePose]], + base_pose: np.ndarray | None, +) -> list[list[FramePose]]: + """Apply one rigid base pose to per-geom world transforms.""" + if base_pose is None: + return transforms + base_rot = base_pose[:3, :3].astype(np.float32) # [3,3] + base_pos = base_pose[:3, 3].astype(np.float32) # [3] + transformed: list[list[FramePose]] = [] + for frame_transforms in transforms: + transformed_frame: list[FramePose] = [] + for pos, rot in frame_transforms: + transformed_frame.append((base_rot @ pos + base_pos, base_rot @ rot)) + transformed.append(transformed_frame) + return transformed + + +def _apply_base_to_named_frames( + frames: FrameSeries, + base_pose: np.ndarray | None, +) -> FrameSeries: + """Apply one rigid base pose to named body/site frames.""" + if base_pose is None: + return frames + base_rot = base_pose[:3, :3].astype(np.float32) # [3,3] + base_pos = base_pose[:3, 3].astype(np.float32) # [3] + transformed: FrameSeries = {} + for frame_key, poses in frames.items(): + transformed[frame_key] = [(base_rot @ pos + base_pos, base_rot @ rot) for pos, rot in poses] + return transformed + + +class 
RobotSceneModel: + """Single public abstraction for robot meshes, IK/FK, and debug frames.""" + + def __init__(self, robot_name: str) -> None: + self.robot_name = robot_name + self._config = get_robot_config(robot_name) + self._mjcf_path = get_mjcf_path(robot_name) + self._home_meshes: list[MeshSpec] | None = None + + @property + def mjcf_path(self) -> str: + """Return the underlying MJCF path for this robot.""" + return self._mjcf_path + + @property + def ee_frame_name(self) -> str: + """Return the canonical IK / debug frame name for this robot.""" + return str(self._config.get("ee_frame", "ee_frame")) + + def get_home_meshes(self, base_pose: np.ndarray | None = None) -> list[MeshSpec]: + """Return home-pose meshes in the requested world frame.""" + if self._home_meshes is None: + loaders = get_robot_loaders() + loader = loaders.get(self.robot_name) + if loader is None: + raise ValueError(f"No robot loader registered for {self.robot_name}") + meshes, _ = loader() + self._home_meshes = _copy_mesh_specs(meshes) + return _transform_mesh_specs(self._home_meshes, base_pose) + + def solve_visual_trajectory( + self, + wrist_poses_world: np.ndarray | None, + gripper_openings: np.ndarray | None = None, + to_opencv: np.ndarray | None = None, + base_pose: np.ndarray | None = None, + ) -> RobotTrajectory | None: + """Solve IK for world-space wrist poses and return render-ready robot state.""" + if wrist_poses_world is None or len(wrist_poses_world) < 2: + return None + + local_wrist_poses = _remove_pose_base(wrist_poses_world, base_pose) # [T,4,4] + target_positions = local_wrist_poses[:, :3, 3].astype(np.float32) # [T,3] + target_rotations = local_wrist_poses[:, :3, :3].astype(np.float32) # [T,3,3] + if to_opencv is not None and not np.allclose(to_opencv, np.eye(3, dtype=np.float32)): + target_rotations = target_rotations @ to_opencv.T[None] # [T,3,3] + + from cosmos_framework.data.vfm.action.urdf_visualizer.ik_solver import ( + compute_fk_ee_poses, + 
compute_mujoco_geom_transforms, + solve_trajectory_ik, + ) + + joint_configs = solve_trajectory_ik( + self.mjcf_path, + target_positions, + gripper_openings=gripper_openings, + world_ee_orientations=target_rotations, + robot_name=self.robot_name, + ) + if joint_configs is None: + return None + + fk_pos, fk_rot = compute_fk_ee_poses(self.mjcf_path, joint_configs, robot_name=self.robot_name) + mesh_transforms, _, _, named_frames = compute_mujoco_geom_transforms(self.mjcf_path, joint_configs) + public_frames: FrameSeries = {} if named_frames is None else dict(named_frames) + public_frames[f"ik:{self.ee_frame_name}"] = list(zip(fk_pos, fk_rot, strict=True)) + + mesh_transforms = _apply_base_to_mesh_transforms(mesh_transforms, base_pose) + public_frames = _apply_base_to_named_frames(public_frames, base_pose) + return RobotTrajectory(mesh_transforms=mesh_transforms, named_frames=public_frames) diff --git a/cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/unified_action.py b/cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/unified_action.py new file mode 100644 index 0000000000000000000000000000000000000000..6fa7f6d5f20d2e380858e0cbeb00f4cb373378fd --- /dev/null +++ b/cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/unified_action.py @@ -0,0 +1,480 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +"""Canonical 57D action representation with explicit input formats. + +57D layout:: + + [ego(9) | R_wrist(9) | R_fingers(15) | L_wrist(9) | L_fingers(15)] + +Each 9D SE(3) slot is ``[pos(3) + rot6d(6)]``. +Each finger slot is 3D (position in wrist-local frame), 5 fingers × 3D = 15D. + +Any supported action format is converted to ``UnifiedAction(action_57d, mask)`` +before the viewer processes it. The mask explicitly declares which slots are valid. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from enum import Enum +from typing import Any + +import numpy as np +import torch + +from cosmos_framework.utils import log +from cosmos_framework.data.vfm.action.pose_utils import convert_rotation + +FINGER_NAMES = ("thumb", "index", "middle", "ring", "pinky") +ALL_FINGERS = (True, True, True, True, True) +NO_FINGERS = (False, False, False, False, False) + + +class ActionFormat(str, Enum): + """Explicit raw action layouts supported by the viewer pipeline.""" + + EGO_9D = "9d" + SINGLE_ARM_10D = "10d" + DUAL_ARM_20D = "20d" + UNIFIED_57D = "57d" + + @property + def expected_dim(self) -> int: + """Return the exact trailing dimension required by this format.""" + return { + ActionFormat.EGO_9D: 9, + ActionFormat.SINGLE_ARM_10D: 10, + ActionFormat.DUAL_ARM_20D: 20, + ActionFormat.UNIFIED_57D: 57, + }[self] + + +# ─── Data Structures ───────────────────────────────────────────────────────── + + +@dataclass +class Action57DMask: + """Per-component validity for the 57D layout. + + ``right_fingers`` / ``left_fingers`` are tuples of 5 bools + (thumb, index, middle, ring, pinky) — supports any combination from + 2-finger grippers to full 5-finger hands. + """ + + ego: bool = False + right_wrist: bool = False + right_fingers: tuple[bool, ...] = NO_FINGERS + left_wrist: bool = False + left_fingers: tuple[bool, ...] = NO_FINGERS + + +@dataclass +class UnifiedAction: + """Canonical 57D action for the viewer pipeline. + + ``action`` is always shape ``(T, 57)`` with invalid slots zero-padded. + ``gripper_right`` / ``gripper_left`` carry auxiliary scalar gripper data + for embodiments that don't map to finger positions (V-shape visualisation). 
+ """ + + action: np.ndarray # (T, 57) + mask: Action57DMask + gripper_right: np.ndarray | None = None # (T,) scalar 0-1 + gripper_left: np.ndarray | None = None # (T,) scalar 0-1 + + +@dataclass +class SceneState: + """Render-ready world-space geometry reconstructed from ``UnifiedAction``. + + Contract: + - all SE(3) trajectories live in one shared ``scene_world`` frame + - fingertip positions are world-space if present + - gripper signals are scalar open/close values sampled at ``T+1`` frames + """ + + mask: Action57DMask = field(default_factory=Action57DMask) + # Absolute SE(3) trajectories — (T+1, 4, 4) + ego_poses: np.ndarray | None = None + right_poses: np.ndarray | None = None + left_poses: np.ndarray | None = None + # World-space fingertip positions — (T+1, 5, 3) + right_fingers: np.ndarray | None = None + left_fingers: np.ndarray | None = None + # Scalar gripper — (T+1,) + gripper_right: np.ndarray | None = None + gripper_left: np.ndarray | None = None + # Metadata + video: np.ndarray | None = None # (T+1, H, W, 3) uint8 + action_raw: np.ndarray | None = None # canonical 57D action tensor for display + T: int = 0 + # FK mesh animation: raw (T, nq) joint configs populated by datasets that + # perform EE conversion internally (e.g. robomind-ur). When set, the renderer + # uses these for FK mesh animation instead of running IK on right_poses. 
+ joint_configs: np.ndarray | None = None + + +# ─── Converters ─────────────────────────────────────────────────────────────── + + +def to_unified_from_57d(action: np.ndarray) -> UnifiedAction: + """57D hand_pose → passthrough, all 5 slots valid.""" + return UnifiedAction( + action=action.astype(np.float32), + mask=Action57DMask( + ego=True, + right_wrist=True, + right_fingers=ALL_FINGERS, + left_wrist=True, + left_fingers=ALL_FINGERS, + ), + ) + + +def to_unified_from_10d(action: np.ndarray) -> UnifiedAction: + """10D single arm ``[pos(3)+rot6d(6)+grip(1)]`` → right wrist + gripper.""" + T = action.shape[0] + a = np.zeros((T, 57), dtype=np.float32) # [T,57] + a[:, 9:18] = action[:, :9] + return UnifiedAction( + action=a, + mask=Action57DMask(right_wrist=True), + gripper_right=action[:, 9].astype(np.float32), + ) + + +def to_unified_from_20d(action: np.ndarray) -> UnifiedAction: + """20D dual arm ``[left(10) | right(10)]`` → both wrists + both grippers. + + Data layout: ``[L_pos(3) + L_rot6d(6) + L_grip(1) | R_pos(3) + R_rot6d(6) + R_grip(1)]``. + Maps left arm → left wrist slot [33:42], right arm → right wrist slot [9:18]. 
+ """ + T = action.shape[0] + a = np.zeros((T, 57), dtype=np.float32) # [T,57] + a[:, 33:42] = action[:, :9] # left arm → left wrist slot [33:42] + a[:, 9:18] = action[:, 10:19] # right arm → right wrist slot [9:18] + return UnifiedAction( + action=a, + mask=Action57DMask(right_wrist=True, left_wrist=True), + gripper_right=action[:, 19].astype(np.float32), # right arm gripper + gripper_left=action[:, 9].astype(np.float32), # left arm gripper + ) + + +def to_unified_from_9d(action: np.ndarray) -> UnifiedAction: + """9D camera/AV ``[pos(3)+rot6d(6)]`` → ego only.""" + T = action.shape[0] + a = np.zeros((T, 57), dtype=np.float32) # [T,57] + a[:, 0:9] = action[:, :9] + return UnifiedAction( + action=a, + mask=Action57DMask(ego=True), + ) + + +def _validate_action_shape(action: np.ndarray, action_format: ActionFormat) -> None: + """Raise when a raw action tensor does not match its declared format.""" + if action.ndim != 2: + raise ValueError(f"Expected a rank-2 action array, got shape {action.shape}") + actual_dim = int(action.shape[-1]) + expected_dim = action_format.expected_dim + if actual_dim != expected_dim: + raise ValueError(f"Action format {action_format.value} expects trailing dim {expected_dim}, got {actual_dim}") + + +def to_unified(action: np.ndarray, action_format: ActionFormat) -> UnifiedAction: + """Convert one explicit raw action format into ``UnifiedAction``.""" + _validate_action_shape(action, action_format) + if action_format is ActionFormat.UNIFIED_57D: + return to_unified_from_57d(action) + if action_format is ActionFormat.DUAL_ARM_20D: + return to_unified_from_20d(action) + if action_format is ActionFormat.EGO_9D: + return to_unified_from_9d(action) + if action_format is ActionFormat.SINGLE_ARM_10D: + return to_unified_from_10d(action) + raise ValueError(f"Unsupported action format: {action_format}") + + +def _pos_rot6d_to_mat(se3: np.ndarray) -> np.ndarray: + """Convert ``(N, 9)`` pos+rot6d to ``(N, 4, 4)`` SE(3) matrices.""" + N = se3.shape[0] + 
pos = se3[:, :3] + r6 = se3[:, 3:9] + + col0 = r6[:, :3].copy() + col0_norm = np.linalg.norm(col0, axis=-1, keepdims=True) + 1e-8 + col0 = col0 / col0_norm + + col1 = r6[:, 3:6] - np.sum(r6[:, 3:6] * col0, axis=-1, keepdims=True) * col0 + col1_norm = np.linalg.norm(col1, axis=-1, keepdims=True) + 1e-8 + col1 = col1 / col1_norm + + col2 = np.cross(col0, col1) + + mats = np.tile(np.eye(4, dtype=np.float32), (N, 1, 1)) + mats[:, :3, 0] = col0 + mats[:, :3, 1] = col1 + mats[:, :3, 2] = col2 + mats[:, :3, 3] = pos + return mats + + +def _chain_se3( + deltas: np.ndarray, + initial_pose: np.ndarray | None = None, + pose_convention: str = "backward_framewise", +) -> np.ndarray: + """Chain ``(T, 9)`` relative deltas into ``(T+1, 4, 4)`` absolute poses. + + For ``backward_framewise``: ``P_{t+1} = P_t @ delta_t``. + For ``absolute``: each row is already an absolute pose (no chaining). + """ + T = deltas.shape[0] + delta_mats = _pos_rot6d_to_mat(deltas) + + if initial_pose is None: + initial_pose = np.eye(4, dtype=np.float32) + else: + initial_pose = initial_pose.astype(np.float32) + + poses = np.empty((T + 1, 4, 4), dtype=np.float32) + poses[0] = initial_pose + + if pose_convention == "absolute": + poses[1:] = delta_mats + else: + for t in range(T): + poses[t + 1] = poses[t] @ delta_mats[t] + + return poses + + +def _extract_fingers(raw: np.ndarray) -> np.ndarray: + """``(T, 15)`` → ``(T+1, 5, 3)`` with first frame duplicated.""" + T = raw.shape[0] + fingers = raw.reshape(T, 5, 3).astype(np.float32) # [T,5,3] + return np.concatenate([fingers[:1], fingers], axis=0) + + +def _to_numpy_float32(value: object) -> np.ndarray: + """Convert a tensor-like value to a float32 NumPy array.""" + + if isinstance(value, torch.Tensor): + return value.detach().cpu().numpy().astype(np.float32) + return np.asarray(value, dtype=np.float32) + + +def _quat_xyzw_to_rotmat(q: np.ndarray) -> np.ndarray: + """Convert ``(N, 4)`` xyzw quaternions to ``(N, 3, 3)`` rotation matrices.""" + x, y, z, w = 
q[:, 0], q[:, 1], q[:, 2], q[:, 3] + R = np.zeros((len(q), 3, 3), dtype=np.float32) + R[:, 0, 0] = 1 - 2 * (y * y + z * z) + R[:, 0, 1] = 2 * (x * y - z * w) + R[:, 0, 2] = 2 * (x * z + y * w) + R[:, 1, 0] = 2 * (x * y + z * w) + R[:, 1, 1] = 1 - 2 * (x * x + z * z) + R[:, 1, 2] = 2 * (y * z - x * w) + R[:, 2, 0] = 2 * (x * z - y * w) + R[:, 2, 1] = 2 * (y * z + x * w) + R[:, 2, 2] = 1 - 2 * (x * x + y * y) + return R + + +def _build_absolute_from_overlay(sample: dict) -> dict[str, np.ndarray] | None: + """Build absolute world-frame poses from HandPoseDataset overlay data. + + Returns None if overlay keys are missing. + """ + raw_cam_pos = sample.get("raw_cam_position") + if raw_cam_pos is None: + return None + + cam_pos = _to_numpy_float32(raw_cam_pos) # [T+1,3] + cam_rot_q = _to_numpy_float32(sample["raw_cam_rotation"]) # [T+1,4] + right_3d = _to_numpy_float32(sample["raw_cam_right_3d"]) # [T+1,63] + left_3d = _to_numpy_float32(sample["raw_cam_left_3d"]) # [T+1,63] + right_rot = _to_numpy_float32(sample["raw_cam_right_rot"]) # [T+1,84] + left_rot = _to_numpy_float32(sample["raw_cam_left_rot"]) # [T+1,84] + + T1 = cam_pos.shape[0] + FTIP = [4, 8, 12, 16, 20] + + # Camera c2w (world frame) + cam_c2w = np.tile(np.eye(4, dtype=np.float32), (T1, 1, 1)) # [T+1,4,4] + cam_c2w[:, :3, 3] = cam_pos + cam_c2w[:, :3, :3] = _quat_xyzw_to_rotmat(cam_rot_q) + + def _wrist_world(pos_63, rot_84): + wrist_pos = pos_63[:, :3] + wrist_q = rot_84.reshape(T1, 21, 4)[:, 0] + wrist_cam = np.tile(np.eye(4, dtype=np.float32), (T1, 1, 1)) # [T+1,4,4] + wrist_cam[:, :3, 3] = wrist_pos + wrist_cam[:, :3, :3] = _quat_xyzw_to_rotmat(wrist_q) + return cam_c2w @ wrist_cam + + def _fingers_world(pos_63): + joints = pos_63.reshape(T1, 21, 3)[:, FTIP] + R = cam_c2w[:, :3, :3] + t = cam_c2w[:, :3, 3] + return np.einsum("tij,tfj->tfi", R, joints) + t[:, None, :] # [T+1,5,3] + + return { + "ego_poses": cam_c2w, + "right_wrist_poses": _wrist_world(right_3d, right_rot), + "left_wrist_poses": 
_wrist_world(left_3d, left_rot), + "right_fingers": _fingers_world(right_3d), + "left_fingers": _fingers_world(left_3d), + } + + +def _build_libero_absolute_from_state(sample: dict) -> dict[str, np.ndarray] | None: + """Build absolute LIBERO right-wrist poses from ``observation.state``. + + LIBERO policy actions are normalized robosuite controller commands rather + than meter-scale SE(3) deltas. For visualization, the raw state sequence is + the correct source of metric end-effector poses. + """ + if sample.get("source_repo_id") is None or sample.get("state") is None: + return None + if "libero" not in str(sample.get("source_repo_id", "")).lower(): + return None + + from cosmos_framework.data.vfm.action.libero_pose_utils import build_libero_abs_pose + + to_opencv = str(sample.get("pose_coordinate_frame", "native")) == "opencv" + right_poses = build_libero_abs_pose(sample["state"], to_opencv=to_opencv) + return {"right_wrist_poses": right_poses.astype(np.float32, copy=False)} + + +# ─── Scene State Builder ───────────────────────────────────────────────────── + + +def build_scene_state( + unified: UnifiedAction, + initial_pose: np.ndarray | None = None, + initial_pose_right: np.ndarray | None = None, + initial_pose_left: np.ndarray | None = None, + right_base_pose: np.ndarray | None = None, + left_base_pose: np.ndarray | None = None, + pose_convention: str = "backward_framewise", + sample: dict | None = None, +) -> SceneState: + """Reconstruct a canonical world-space ``SceneState`` from ``UnifiedAction``. + + Chains SE(3) deltas for valid mask slots. If ``sample`` contains overlay + data (HandPoseDataset raw camera/joint fields), overrides with absolute + world-frame poses. + + Args: + unified: Canonical 57D action with mask. + initial_pose: Default initial pose for all slots. + initial_pose_right: Override for right wrist (dual arm). + initial_pose_left: Override for left wrist (dual arm). 
+ right_base_pose: Right-arm base pose that maps arm-local trajectories into ``scene_world``. + left_base_pose: Left-arm base pose that maps arm-local trajectories into ``scene_world``. + pose_convention: Pose convention for SE(3) chaining. + sample: Raw dataset sample (for overlay data). + """ + + def _apply_pose_base(poses: np.ndarray | None, base_pose: np.ndarray | None) -> np.ndarray | None: + if poses is None or base_pose is None: + return poses + return np.einsum("ij,njk->nik", base_pose, poses).astype(np.float32) # [T+1,4,4] + + def _fingers_local_to_world( + fingers_local: np.ndarray | None, + wrist_poses_world: np.ndarray | None, + ) -> np.ndarray | None: + if fingers_local is None: + return None + if wrist_poses_world is None: + raise ValueError("Finger trajectories require matching wrist poses to build world-space SceneState") + wrist_rot = wrist_poses_world[:, :3, :3].astype(np.float32) # [T+1,3,3] + wrist_pos = wrist_poses_world[:, :3, 3].astype(np.float32) # [T+1,3] + return np.einsum("tij,tfj->tfi", wrist_rot, fingers_local) + wrist_pos[:, None, :] # [T+1,5,3] + + mask = unified.mask + action = unified.action + state = SceneState(mask=mask) + + ip_default = initial_pose if initial_pose is not None else np.eye(4, dtype=np.float32) + ip_right = initial_pose_right if initial_pose_right is not None else ip_default + ip_left = initial_pose_left if initial_pose_left is not None else ip_default + + if mask.ego: + state.ego_poses = _chain_se3(action[:, 0:9], ip_default, pose_convention) + if mask.right_wrist: + state.right_poses = _chain_se3(action[:, 9:18], ip_right, pose_convention) + if any(mask.right_fingers): + state.right_fingers = _extract_fingers(action[:, 18:33]) + if mask.left_wrist: + state.left_poses = _chain_se3(action[:, 33:42], ip_left, pose_convention) + if any(mask.left_fingers): + state.left_fingers = _extract_fingers(action[:, 42:57]) + + if unified.gripper_right is not None: + g = unified.gripper_right + state.gripper_right = 
np.concatenate([[g[0]], g]).astype(np.float32, copy=False) # [T+1] + if unified.gripper_left is not None: + g = unified.gripper_left + state.gripper_left = np.concatenate([[g[0]], g]).astype(np.float32, copy=False) # [T+1] + + abs_data = _build_absolute_from_overlay(sample) if sample is not None else None + if abs_data is not None: + state.ego_poses = abs_data["ego_poses"] + state.right_poses = abs_data["right_wrist_poses"] + state.left_poses = abs_data["left_wrist_poses"] + state.right_fingers = abs_data["right_fingers"] + state.left_fingers = abs_data["left_fingers"] + log.info( + f"Overlay absolute mode | ego range: " + f"[{abs_data['ego_poses'][:, :3, 3].min():.3f}, " + f"{abs_data['ego_poses'][:, :3, 3].max():.3f}] | " + f"R wrist[0]: {abs_data['right_wrist_poses'][0, :3, 3]}" + ) + else: + state.right_poses = _apply_pose_base(state.right_poses, right_base_pose) + state.left_poses = _apply_pose_base(state.left_poses, left_base_pose) + + libero_abs_data = _build_libero_absolute_from_state(sample) if sample is not None else None + if libero_abs_data is not None: + state.right_poses = libero_abs_data["right_wrist_poses"] + + state.right_fingers = _fingers_local_to_world(state.right_fingers, state.right_poses) + state.left_fingers = _fingers_local_to_world(state.left_fingers, state.left_poses) + + state.action_raw = unified.action.astype(np.float32) + state.T = action.shape[0] + return state + + +# ─── Video Extraction ───────────────────────────────────────────────────────── + + +def get_video_from_sample(sample: dict) -> np.ndarray | None: + """Extract video frames from a dataset sample. + + Returns ``(T+1, H, W, 3)`` uint8 array, or None. 
+ """ + video = sample.get("video") + if video is None: + return None + if isinstance(video, torch.Tensor): + video = video.numpy() + + if video.ndim == 4: + C, T_dim, H, W = video.shape + if C in (1, 3) and T_dim > 3: + video = np.transpose(video, (1, 2, 3, 0)) + elif video.shape[1] in (1, 3) and T_dim <= 3: + video = np.transpose(video, (0, 2, 3, 1)) + + if video.dtype in (np.float32, np.float64): + video = np.clip(video * 255, 0, 255).astype(np.uint8) + + if video.ndim == 4 and video.shape[-1] == 1: + video = np.repeat(video, 3, axis=-1) + + return video diff --git a/cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/unified_renderer.py b/cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/unified_renderer.py new file mode 100644 index 0000000000000000000000000000000000000000..a176307612dc7d51a8081cd3c8002832a9118838 --- /dev/null +++ b/cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/unified_renderer.py @@ -0,0 +1,836 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +"""Mask-driven renderer for ``SceneState`` — the unified 57D viewer backend. + +Owns all viser scene handles. Draws only what the mask declares valid. +No action-format branching — everything goes through ``SceneState``. 
+""" + +from __future__ import annotations + +from typing import Any + +import numpy as np + +from cosmos_framework.utils import log +from cosmos_framework.data.vfm.action.urdf_visualizer.robot_scene_model import RobotSceneModel +from cosmos_framework.data.vfm.action.urdf_visualizer.unified_action import FINGER_NAMES, SceneState + + +class UnifiedRenderer: + """Mask-driven 3D renderer for the unified 57D action viewer.""" + + # ── Palette ── + TIP_RADIUS = 0.0042 + FRUSTUM_SCALE = 0.10 + EGO_AXIS_LENGTH = 0.04 + EGO_AXIS_RADIUS = 0.002 + EGO_TRAJ_LENGTH_REFERENCE = 0.55 + EGO_TRAJ_SCALE_MIN = 0.35 + EGO_TRAJ_SCALE_MAX = 1.80 + EGO_TRAJ_LINE_WIDTH = 1.0 + EE_TRAJ_LINE_WIDTH = 1.5 + HAND_AXIS_LENGTH = 0.05 + HAND_AXIS_RADIUS = 0.002 + ROBOT_BODY_AXIS_LENGTH = 0.03 + ROBOT_BODY_AXIS_RADIUS = 0.0012 + ROBOT_SITE_AXIS_LENGTH = 0.022 + ROBOT_SITE_AXIS_RADIUS = 0.0009 + COLOR_EGO = (52, 152, 219) # blue + COLOR_EGO_TOP = (231, 76, 60) # red + COLOR_RIGHT = (243, 156, 18) # orange + COLOR_LEFT = (155, 89, 182) # purple + FINGER_COLORS = [ + (231, 76, 60), + (241, 196, 15), + (46, 204, 113), + (52, 152, 219), + (155, 89, 182), + ] + + @staticmethod + def _soften_color(color: tuple[int, int, int], mix: float = 0.35) -> tuple[int, int, int]: + """Blend a color toward white for less visually dominant trajectories.""" + base = np.asarray(color, dtype=np.float32) + soft = base * (1.0 - mix) + 255.0 * mix + rounded = soft.round() + return int(rounded[0]), int(rounded[1]), int(rounded[2]) + + def _p(self, path: str) -> str: + """Prepend the instance name_prefix to a scene path.""" + return f"{self._name_prefix}{path}" + + def __init__( + self, + server, + name_prefix: str = "", + palette: dict | None = None, + ): + import viser.transforms as vtf + + self.server = server + self.vtf = vtf + self.state: SceneState | None = None + self._entry: Any = None + self._axis_scale = 1.0 + self._name_prefix = name_prefix + + self._ego_frustum_scale_base = self.FRUSTUM_SCALE + 
self._ego_axis_length_base = self.EGO_AXIS_LENGTH + self._ego_axis_radius_base = self.EGO_AXIS_RADIUS + self._ego_frustum_fov = np.deg2rad(60.0) + self._ego_frustum_aspect = 4 / 3 + + # Per-instance color palette (allows GT and pred to use different colors) + pal = palette or {} + self.COLOR_EGO = pal.get("ego", type(self).COLOR_EGO) + self.COLOR_EGO_TOP = pal.get("ego_top", type(self).COLOR_EGO_TOP) + self.COLOR_RIGHT = pal.get("right", type(self).COLOR_RIGHT) + self.COLOR_LEFT = pal.get("left", type(self).COLOR_LEFT) + + # ── Ego ── + self.ego_frame = server.scene.add_frame( + self._p("/ego/frame"), axes_length=self.EGO_AXIS_LENGTH, axes_radius=self.EGO_AXIS_RADIUS + ) + self.ego_frustum = server.scene.add_line_segments( + self._p("/ego/frustum"), + points=self._make_ego_frustum_wireframe_points(self._ego_frustum_fov, self._ego_frustum_aspect), + colors=np.array(self.COLOR_EGO, dtype=np.uint8), + scale=self.FRUSTUM_SCALE, + line_width=2.0, + wxyz=(1.0, 0.0, 0.0, 0.0), + position=(0.0, 0.0, 0.0), + ) + self.ego_frustum_up = server.scene.add_line_segments( + self._p("/ego/frustum_up"), + points=self._make_ego_frustum_up_points(self._ego_frustum_fov, self._ego_frustum_aspect), + colors=np.array(self.COLOR_EGO_TOP, dtype=np.uint8), + line_width=3.0, + scale=self.FRUSTUM_SCALE, + wxyz=(1.0, 0.0, 0.0, 0.0), + position=(0.0, 0.0, 0.0), + ) + self.ego_traj = server.scene.add_spline_catmull_rom( + self._p("/ego/traj"), + positions=np.zeros([2, 3], dtype=np.float32), + color=self._soften_color(self.COLOR_EGO), + line_width=self.EGO_TRAJ_LINE_WIDTH, + ) + + # ── Right effector ── + self.right_frame = server.scene.add_frame( + self._p("/right/frame"), axes_length=self.HAND_AXIS_LENGTH, axes_radius=self.HAND_AXIS_RADIUS + ) + self.right_traj = server.scene.add_spline_catmull_rom( + self._p("/right/traj"), + positions=np.zeros([2, 3], dtype=np.float32), + color=self._soften_color(self.COLOR_RIGHT), + line_width=self.EE_TRAJ_LINE_WIDTH, + ) + self.right_ee = 
server.scene.add_point_cloud( + self._p("/right/point"), + points=np.zeros([1, 3], dtype=np.float32), + colors=np.array([self.COLOR_RIGHT], dtype=np.uint8), + point_size=0.015, + point_shape="circle", + ) + self.right_fingers = [ + server.scene.add_icosphere( + self._p(f"/right/finger_{FINGER_NAMES[i]}"), + radius=self.TIP_RADIUS, + color=self.FINGER_COLORS[i], + position=(0.0, 0.0, 0.0), + ) + for i in range(5) + ] + self.right_gripper_tips = [ + server.scene.add_icosphere( + self._p(f"/right/gripper_tip_{side}"), + radius=self.TIP_RADIUS, + color=self.FINGER_COLORS[i], + position=(0.0, 0.0, 0.0), + ) + for i, side in enumerate(("thumb", "index")) + ] + + # ── Left effector ── + self.left_frame = server.scene.add_frame( + self._p("/left/frame"), axes_length=self.HAND_AXIS_LENGTH, axes_radius=self.HAND_AXIS_RADIUS + ) + self.left_traj = server.scene.add_spline_catmull_rom( + self._p("/left/traj"), + positions=np.zeros([2, 3], dtype=np.float32), + color=self._soften_color(self.COLOR_LEFT), + line_width=self.EE_TRAJ_LINE_WIDTH, + ) + self.left_ee = server.scene.add_point_cloud( + self._p("/left/point"), + points=np.zeros([1, 3], dtype=np.float32), + colors=np.array([self.COLOR_LEFT], dtype=np.uint8), + point_size=0.015, + point_shape="circle", + ) + self.left_fingers = [ + server.scene.add_icosphere( + self._p(f"/left/finger_{FINGER_NAMES[i]}"), + radius=self.TIP_RADIUS, + color=self.FINGER_COLORS[i], + position=(0.0, 0.0, 0.0), + ) + for i in range(5) + ] + self.left_gripper_tips = [ + server.scene.add_icosphere( + self._p(f"/left/gripper_tip_{side}"), + radius=self.TIP_RADIUS, + color=self.FINGER_COLORS[i], + position=(0.0, 0.0, 0.0), + ) + for i, side in enumerate(("thumb", "index")) + ] + + # ── IK robot meshes ── + self.robot_right: list = [] + self.robot_left: list = [] + self._robot_frame_handles_right: dict[str, Any] = {} + self._robot_frame_handles_left: dict[str, Any] = {} + self._current_robot: Any | None = None + self._robot_scene_model: RobotSceneModel | 
None = None + self._ik_right: list | None = None + self._ik_left: list | None = None + self._robot_frames_right: dict[str, list[tuple[np.ndarray, np.ndarray]]] | None = None + self._robot_frames_left: dict[str, list[tuple[np.ndarray, np.ndarray]]] | None = None + self._robot_link_names: list[str] = [] + self._robot_local_transforms: list[np.ndarray] = [] + + # ── Video panel (set by viewer.py) ── + self._cam_handle: Any | None = None + + self.hide_all() + + # ─── Per-Episode ────────────────────────────────────────────────────────── + + def load( + self, + state: SceneState, + entry: Any, + to_opencv: np.ndarray | dict[str, np.ndarray] | None = None, + ): + """Load a new episode. Rebuild trajectories, robot meshes, and IK. + + Args: + state: Reconstructed ``SceneState`` with absolute poses. + entry: ``DatasetEntry`` with robot_name, max_finger_width, etc. + to_opencv: Optional native-to-OpenCV rotation. + """ + self.state = state + self._entry = entry + self._to_opencv = to_opencv if to_opencv is not None else np.eye(3, dtype=np.float32) + self._ego_frustum_fov = np.deg2rad(entry.camera_fov_deg) + self._ego_frustum_aspect = float(entry.camera_aspect) + self.ego_frustum.points = self._make_ego_frustum_wireframe_points( + self._ego_frustum_fov, + self._ego_frustum_aspect, + ) + self.ego_frustum_up.points = self._make_ego_frustum_up_points( + self._ego_frustum_fov, + self._ego_frustum_aspect, + ) + self._update_ego_visual_scale(state.ego_poses if state.mask.ego else None) + self.update_axis_scale(self._axis_scale) + self.hide_all() + + # ── Robot meshes + IK from canonical world-space SceneState ── + self._ik_right = None + self._ik_left = None + self._robot_frames_right = None + self._robot_frames_left = None + if entry.robot_name: + self._load_robot_and_ik(state, entry) + + # Rebuild trajectory splines from canonical world-space SceneState. 
+ if state.mask.ego and state.ego_poses is not None: + self._rebuild_traj(self.ego_traj, state.ego_poses, self.COLOR_EGO) + if state.mask.right_wrist and state.right_poses is not None: + self._rebuild_traj(self.right_traj, state.right_poses, self.COLOR_RIGHT) + if state.mask.left_wrist and state.left_poses is not None: + self._rebuild_traj(self.left_traj, state.left_poses, self.COLOR_LEFT) + + def set_video_panel(self, panel_handle: Any | None) -> None: + """Attach the optional GUI image panel used for episode video.""" + self._cam_handle = panel_handle + + # ─── Per-Frame ──────────────────────────────────────────────────────────── + + def update(self, t: int, show: dict): + """Update all scene elements for time step ``t``. + + Args: + t: Frame index (0-based). + show: Visibility flags: ``frames``, ``traj``, ``fingertips``, ``ego``, ``robot``. + """ + state = self.state + if state is None: + return + mask = state.mask + + # ── Ego ── + self._update_ego(t, state.ego_poses, mask.ego and show.get("ego", False), show) + + # ── Right effector ── + self._update_effector( + t, + state.right_poses, + mask.right_wrist, + self.right_frame, + self.right_ee, + self.right_traj, + show, + ) + self._update_fingers( + t, + state.right_fingers, + mask.right_fingers, + self.right_fingers, + show, + ) + self._update_gripper( + t, + state.right_poses, + state.gripper_right, + mask.right_wrist, + mask.right_fingers, + self.right_gripper_tips, + show, + ) + + # ── Left effector ── + self._update_effector( + t, + state.left_poses, + mask.left_wrist, + self.left_frame, + self.left_ee, + self.left_traj, + show, + ) + self._update_fingers( + t, + state.left_fingers, + mask.left_fingers, + self.left_fingers, + show, + ) + self._update_gripper( + t, + state.left_poses, + state.gripper_left, + mask.left_wrist, + mask.left_fingers, + self.left_gripper_tips, + show, + ) + + # ── IK robot meshes ── + self._update_robot(t, show) + + # ── Video panel ── + if self._cam_handle is not None and 
state.video is not None and t < len(state.video): + self._cam_handle.image = state.video[t] + + # ─── Action Text ────────────────────────────────────────────────────────── + + def format_action_text(self, t: int) -> str: + """Return a formatted string showing 57D action values at step ``t``. + + Always shows the full 57D layout. Validity indicator (✓/·) in front of + each component based on the mask. + """ + state = self.state + if state is None or state.action_raw is None: + return "" + if t == 0: + return "*t=0: anchor pose (identity)*" + if (t - 1) >= len(state.action_raw): + return "" + + a = state.action_raw[t - 1] # always 57D (zero-padded) + mask = state.mask + + def _fmt(v): + return " ".join(f"{x:+.4f}" for x in v) + + def _v(active): + return "✓" if active else "·" + + gr = a[18:33].reshape(5, 3) + gl = a[42:57].reshape(5, 3) + + # Gripper auxiliary values (not in 57D vector) + grip_r_str = "" + grip_l_str = "" + if state.gripper_right is not None and t < len(state.gripper_right): + grip_r_str = f" ✓ gripper {state.gripper_right[t]:+.4f}" + if state.gripper_left is not None and t < len(state.gripper_left): + grip_l_str = f" ✓ gripper {state.gripper_left[t]:+.4f}" + + parts = [ + f"step {t - 1} → {t} (57D)", + "═" * 36, + f"{_v(mask.ego)} Ego pos [0:3] {_fmt(a[0:3])}", + f" {' ' * 1} rot [3:9] {_fmt(a[3:9])}", + "", + f"{_v(mask.right_wrist)} R wrist pos [9:12] {_fmt(a[9:12])}", + f" {' ' * 1} rot [12:18] {_fmt(a[12:18])}", + f" R fingers [18:33]", + ] + for i, name in enumerate(FINGER_NAMES): + parts.append(f" {_v(mask.right_fingers[i])} {name:7s} {_fmt(gr[i])}") + if grip_r_str: + parts.append(grip_r_str) + + parts += [ + "", + f"{_v(mask.left_wrist)} L wrist pos [33:36] {_fmt(a[33:36])}", + f" {' ' * 1} rot [36:42] {_fmt(a[36:42])}", + f" L fingers [42:57]", + ] + for i, name in enumerate(FINGER_NAMES): + parts.append(f" {_v(mask.left_fingers[i])} {name:7s} {_fmt(gl[i])}") + if grip_l_str: + parts.append(grip_l_str) + + return str("```\n" + 
"\n".join(parts) + "\n```") + + # ─── Private: Effector ──────────────────────────────────────────────────── + + @staticmethod + def _make_ego_frustum_wireframe_points(fov: float, aspect: float) -> np.ndarray: + """Build wireframe segments for the ego camera frustum.""" + half_height = float(np.tan(fov / 2.0)) + half_width = float(aspect) * half_height + top_left = np.array([-half_width, -half_height, 1.0], dtype=np.float32) + top_right = np.array([half_width, -half_height, 1.0], dtype=np.float32) + bottom_right = np.array([half_width, half_height, 1.0], dtype=np.float32) + bottom_left = np.array([-half_width, half_height, 1.0], dtype=np.float32) + origin = np.array([0.0, 0.0, 0.0], dtype=np.float32) + return np.array( + [ + [origin, top_left], + [origin, top_right], + [origin, bottom_right], + [origin, bottom_left], + [top_left, top_right], + [top_right, bottom_right], + [bottom_right, bottom_left], + [bottom_left, top_left], + ], + dtype=np.float32, + ) + + @staticmethod + def _make_ego_frustum_up_points(fov: float, aspect: float) -> np.ndarray: + """Build a red segment that marks the frustum's far-edge upright tick.""" + half_height = float(np.tan(fov / 2.0)) + _ = aspect + top_y = -half_height + return np.array( + [[[0.0, top_y, 1.0], [0.0, top_y * 1.18, 1.0]]], + dtype=np.float32, + ) + + def _update_ego(self, t: int, poses: np.ndarray | None, active: bool, show: dict) -> None: + if active and poses is not None and t < len(poses): + pos = poses[t, :3, 3] + rot = poses[t, :3, :3] + wxyz = self.vtf.SO3.from_matrix(rot).wxyz + self.ego_frame.position = pos + self.ego_frame.wxyz = wxyz + self.ego_frame.visible = show.get("frames", True) + self.ego_frustum.position = pos + self.ego_frustum.wxyz = wxyz + self.ego_frustum.visible = True + self.ego_frustum_up.position = pos + self.ego_frustum_up.wxyz = wxyz + self.ego_frustum_up.visible = True + self.ego_traj.visible = show.get("traj", True) + else: + self.ego_frame.visible = False + self.ego_frustum.visible = False + 
self.ego_frustum_up.visible = False + self.ego_traj.visible = False + + def _update_effector(self, t, poses, active, frame, ee, traj, show): + if active and poses is not None and t < len(poses): + pos = poses[t, :3, 3] + rot = poses[t, :3, :3] + frame.position = pos + frame.wxyz = self.vtf.SO3.from_matrix(rot).wxyz + frame.visible = show.get("frames", True) + ee.points = pos[None] + ee.visible = True + traj.visible = show.get("traj", True) + else: + frame.visible = False + ee.visible = False + traj.visible = False + + # ─── Private: Fingers ───────────────────────────────────────────────────── + + def _update_fingers(self, t, fingers, finger_mask, handles, show): + if fingers is None or t >= len(fingers): + for h in handles: + h.visible = False + return + if not show.get("fingertips", True): + for h in handles: + h.visible = False + return + + g = fingers[t] # (5, 3) + for fi, h in enumerate(handles): + if finger_mask[fi]: + h.position = g[fi].astype(np.float32) + h.visible = True + else: + h.visible = False + + # ─── Private: Gripper ───────────────────────────────────────────────────── + + def _update_gripper(self, t, poses, gripper, wrist_active, finger_mask, handle, show): + has_fingers = any(finger_mask) + if not wrist_active or has_fingers or gripper is None or poses is None or t >= len(gripper) or t >= len(poses): + for tip_handle in handle: + tip_handle.visible = False + return + if not show.get("fingertips", True): + for tip_handle in handle: + tip_handle.visible = False + return + pos = poses[t, :3, 3].astype(np.float32) + rot = poses[t, :3, :3].astype(np.float32) + g = float(gripper[t]) + mfw = getattr(self._entry, "max_finger_width", 0.05) + half_w = g * mfw / 2.0 + finger_len = 0.06 + tip_l = pos + rot @ np.array([half_w, 0, finger_len], dtype=np.float32) + tip_r = pos + rot @ np.array([-half_w, 0, finger_len], dtype=np.float32) + for tip_handle, tip in zip(handle, (tip_l, tip_r), strict=True): + tip_handle.position = tip + tip_handle.visible = True + 
+ # ─── Private: Trajectory ────────────────────────────────────────────────── + + def _rebuild_traj(self, traj_handle, poses, color): + """Rebuild a trajectory spline from absolute poses.""" + positions = poses[:, :3, 3].astype(np.float32) + if len(positions) < 2: + traj_handle.visible = False + return + line_width = self.EGO_TRAJ_LINE_WIDTH if traj_handle is self.ego_traj else self.EE_TRAJ_LINE_WIDTH + # Remove and recreate to avoid stale color issues in viser + name = traj_handle.name if hasattr(traj_handle, "name") else "/tmp/traj" + traj_handle.remove() + new_handle = self.server.scene.add_spline_catmull_rom( + name, + positions=positions, + color=self._soften_color(color), + line_width=line_width, + ) + # Update the reference — need to figure out which attribute to update + if traj_handle is self.ego_traj: + self.ego_traj = new_handle + elif traj_handle is self.right_traj: + self.right_traj = new_handle + elif traj_handle is self.left_traj: + self.left_traj = new_handle + + @staticmethod + def _trajectory_length(poses: np.ndarray | None) -> float: + """Compute the total path length of a pose trajectory.""" + if poses is None or len(poses) < 2: + return 0.0 + positions = poses[:, :3, 3].astype(np.float32) + deltas = np.diff(positions, axis=0) + return float(np.linalg.norm(deltas, axis=1).sum()) + + def _update_ego_visual_scale(self, poses: np.ndarray | None) -> None: + """Scale the ego camera frame/frustum from the episode trajectory length.""" + traj_length = self._trajectory_length(poses) + if traj_length <= 0.0: + traj_scale = 1.0 + else: + traj_scale = float( + np.clip( + traj_length / self.EGO_TRAJ_LENGTH_REFERENCE, + self.EGO_TRAJ_SCALE_MIN, + self.EGO_TRAJ_SCALE_MAX, + ) + ) + self._ego_frustum_scale_base = self.FRUSTUM_SCALE * traj_scale + self._ego_axis_length_base = self.EGO_AXIS_LENGTH * traj_scale + self._ego_axis_radius_base = self.EGO_AXIS_RADIUS * traj_scale + + # ─── Private: IK Robot ──────────────────────────────────────────────────── + + 
@classmethod + def _robot_frame_dims(cls, frame_key: str) -> tuple[float, float]: + """Return axis length/radius for one robot debug frame.""" + if frame_key.startswith("site:"): + return cls.ROBOT_SITE_AXIS_LENGTH, cls.ROBOT_SITE_AXIS_RADIUS + return cls.ROBOT_BODY_AXIS_LENGTH, cls.ROBOT_BODY_AXIS_RADIUS + + @staticmethod + def _clear_robot_frame_handles(handles: dict[str, Any]) -> None: + """Remove all robot debug frame handles in one arm.""" + for handle in handles.values(): + handle.remove() + handles.clear() + + def _robot_frame_selector_key(self, arm: str, frame_key: str) -> str: + """Return the GUI selector key for one robot debug frame.""" + if self._robot_frames_left is None and arm == "right": + return frame_key + return f"{arm}/{frame_key}" + + def get_robot_frame_selectors(self) -> list[tuple[str, str]]: + """Return selector keys and checkbox labels for available robot frames.""" + selectors = [] + for arm, frames in [("right", self._robot_frames_right), ("left", self._robot_frames_left)]: + if frames is None: + continue + for frame_key in sorted(frames): + selector_key = self._robot_frame_selector_key(arm, frame_key) + if self._robot_frames_left is None and arm == "right": + label = frame_key + else: + label = selector_key + selectors.append((selector_key, label)) + return selectors + + def _rebuild_robot_frame_handles( + self, + arm: str, + frames: dict[str, list[tuple[np.ndarray, np.ndarray]]] | None, + ) -> None: + """Recreate robot debug frame handles for one arm.""" + handles = self._robot_frame_handles_right if arm == "right" else self._robot_frame_handles_left + self._clear_robot_frame_handles(handles) + if frames is None: + return + for frame_key in sorted(frames): + kind, name = frame_key.split(":", 1) + axes_length, axes_radius = self._robot_frame_dims(frame_key) + handles[frame_key] = self.server.scene.add_frame( + self._p(f"/robot_{arm}_frames/{kind}/{name}"), + axes_length=axes_length * self._axis_scale, + axes_radius=axes_radius * 
self._axis_scale, + ) + log.info(f"Loaded {len(handles)} robot debug frames for {arm} arm") + + def _update_robot_debug_frames(self, t: int, show: dict) -> None: + """Update body/site coordinate frame overlays for both arms.""" + filters = show.get("robot_frame_filters", {}) + for arm, handles, frames in [ + ("right", self._robot_frame_handles_right, self._robot_frames_right), + ("left", self._robot_frame_handles_left, self._robot_frames_left), + ]: + for frame_key, handle in handles.items(): + poses = frames.get(frame_key) if frames is not None else None + selector_key = self._robot_frame_selector_key(arm, frame_key) + frame_enabled = bool(filters.get(selector_key, False)) + if poses is not None and t < len(poses) and frame_enabled: + pos, rot = poses[t] + handle.position = pos + handle.wxyz = self.vtf.SO3.from_matrix(rot).wxyz + handle.visible = True + else: + handle.visible = False + + def _load_robot_and_ik(self, state: SceneState, entry: Any): + """Load robot meshes and solve IK for the episode.""" + # Determine robot config key (single vs dual) + is_dual = state.mask.left_wrist and entry.dual_base_left is not None + robot_key = (entry.robot_name, "dual") if is_dual else entry.robot_name + + if self._robot_scene_model is None or self._robot_scene_model.robot_name != entry.robot_name: + try: + self._robot_scene_model = RobotSceneModel(entry.robot_name) + except Exception as e: + log.warning(f"RobotSceneModel unavailable for {entry.robot_name}: {e}") + self._robot_scene_model = None + return + + # Only reload meshes if robot changed + if self._current_robot != robot_key: + for h in self.robot_right + self.robot_left: + h.remove() + self.robot_right = [] + self.robot_left = [] + self._clear_robot_frame_handles(self._robot_frame_handles_right) + self._clear_robot_frame_handles(self._robot_frame_handles_left) + + if self._robot_scene_model is None: + self._current_robot = robot_key + return + + right_meshes = 
self._robot_scene_model.get_home_meshes(entry.dual_base_right if is_dual else None) + for name, mesh, transform in right_meshes: + h = self.server.scene.add_mesh_trimesh( + self._p(f"/robot_right/{name}"), + mesh=mesh, + ) + h.position = transform[:3, 3] + h.wxyz = self.vtf.SO3.from_matrix(transform[:3, :3]).wxyz + self.robot_right.append(h) + + if is_dual: + left_meshes = self._robot_scene_model.get_home_meshes(entry.dual_base_left) + for name, mesh, transform in left_meshes: + h = self.server.scene.add_mesh_trimesh( + self._p(f"/robot_left/{name}"), + mesh=mesh, + ) + h.position = transform[:3, 3] + h.wxyz = self.vtf.SO3.from_matrix(transform[:3, :3]).wxyz + self.robot_left.append(h) + + self._current_robot = robot_key + log.info(f"Loaded {len(right_meshes)} meshes for {entry.robot_name}") + + if self._robot_scene_model is None: + return + + # Joint-position datasets (e.g. robomind-ur): bypass IK, use FK directly + if state.joint_configs is not None: + from cosmos_framework.data.vfm.action.urdf_visualizer.ik_solver import compute_mujoco_geom_transforms + from cosmos_framework.data.vfm.action.urdf_visualizer.robot_scene_model import get_mjcf_path + + try: + mjcf_path = get_mjcf_path(entry.robot_name) + transforms, _, _fk_ee_poses, robot_frames = compute_mujoco_geom_transforms( + mjcf_path, state.joint_configs + ) + self._ik_right = transforms + self._robot_frames_right = robot_frames + self._rebuild_robot_frame_handles("right", robot_frames) + log.info(f"FK geom transforms computed for {len(transforms)} frames ({entry.robot_name})") + except Exception as e: + log.warning(f"FK failed for {entry.robot_name}: {e}") + import traceback + + traceback.print_exc() + return + + # Right arm IK + if state.right_poses is not None: + try: + right_result = self._robot_scene_model.solve_visual_trajectory( + state.right_poses, + gripper_openings=state.gripper_right, + to_opencv=self._to_opencv, + base_pose=entry.dual_base_right if is_dual else None, + ) + if right_result is not 
None: + self._ik_right = right_result.mesh_transforms + self._robot_frames_right = right_result.named_frames + self._rebuild_robot_frame_handles("right", self._robot_frames_right) + else: + self._ik_right = None + self._robot_frames_right = None + self._rebuild_robot_frame_handles("right", None) + except Exception as e: + log.warning(f"IK failed (right): {e}") + self._ik_right = None + self._robot_frames_right = None + self._rebuild_robot_frame_handles("right", None) + else: + self._ik_right = None + self._robot_frames_right = None + self._rebuild_robot_frame_handles("right", None) + + # Left arm IK (dual only) + if is_dual and state.left_poses is not None: + try: + left_result = self._robot_scene_model.solve_visual_trajectory( + state.left_poses, + gripper_openings=state.gripper_left, + to_opencv=self._to_opencv, + base_pose=entry.dual_base_left, + ) + if left_result is not None: + self._ik_left = left_result.mesh_transforms + self._robot_frames_left = left_result.named_frames + self._rebuild_robot_frame_handles("left", self._robot_frames_left) + else: + self._ik_left = None + self._robot_frames_left = None + self._rebuild_robot_frame_handles("left", None) + except Exception as e: + log.warning(f"IK failed (left): {e}") + self._ik_left = None + self._robot_frames_left = None + self._rebuild_robot_frame_handles("left", None) + else: + self._ik_left = None + self._robot_frames_left = None + self._rebuild_robot_frame_handles("left", None) + + def _update_robot(self, t: int, show: dict): + vis = show.get("robot", True) + for handles, ik in [(self.robot_right, self._ik_right), (self.robot_left, self._ik_left)]: + for idx, h in enumerate(handles): + if vis and ik is not None and t < len(ik) and idx < len(ik[t]): + p, m = ik[t][idx] + h.position = p + h.wxyz = self.vtf.SO3.from_matrix(m).wxyz + h.visible = True + else: + h.visible = False + self._update_robot_debug_frames(t, show) + + # ─── Visibility ─────────────────────────────────────────────────────────── + + def 
hide_all(self): + """Hide every scene element.""" + for attr in [ + self.ego_frame, + self.ego_frustum, + self.ego_frustum_up, + self.ego_traj, + self.right_frame, + self.right_ee, + self.right_traj, + self.left_frame, + self.left_ee, + self.left_traj, + ]: + attr.visible = False + for h in self.right_fingers + self.left_fingers: + h.visible = False + for h in self.right_gripper_tips + self.left_gripper_tips: + h.visible = False + for h in self.robot_right + self.robot_left: + h.visible = False + for handle in list(self._robot_frame_handles_right.values()) + list(self._robot_frame_handles_left.values()): + handle.visible = False + + def update_axis_scale(self, scale: float): + """Update coordinate frame axis size and effector point size.""" + self._axis_scale = scale + s = scale + self.ego_frame.axes_length = self._ego_axis_length_base * s + self.ego_frame.axes_radius = self._ego_axis_radius_base * s + self.ego_frustum.scale = self._ego_frustum_scale_base * s + self.ego_frustum_up.scale = self._ego_frustum_scale_base * s + for frame in (self.right_frame, self.left_frame): + frame.axes_length = self.HAND_AXIS_LENGTH * s + frame.axes_radius = self.HAND_AXIS_RADIUS * s + for ee in (self.right_ee, self.left_ee): + ee.point_size = 0.015 * s + for handles in (self._robot_frame_handles_right, self._robot_frame_handles_left): + for frame_key, handle in handles.items(): + axes_length, axes_radius = self._robot_frame_dims(frame_key) + handle.axes_length = axes_length * s + handle.axes_radius = axes_radius * s diff --git a/cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/ur5e_robotiq_2f85.xml b/cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/ur5e_robotiq_2f85.xml new file mode 100644 index 0000000000000000000000000000000000000000..9f477627d8ba57fb6ecf87c8449827477457f566 --- /dev/null +++ b/cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/ur5e_robotiq_2f85.xml @@ -0,0 +1,326 @@ + + + + + + + diff --git 
a/cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/urdf_loader.py b/cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/urdf_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..32b78f29821d5d78aed55ff2c0d9223ab5fe5789 --- /dev/null +++ b/cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/urdf_loader.py @@ -0,0 +1,919 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +"""Robot mesh loading from mujoco_menagerie. + +Downloads robot MJCF/URDF/mesh assets on first use, caches in ~/.cache/mujoco_menagerie/. +Each loader returns (meshes, ee_home_pose) where: + - meshes: list of (name, trimesh.Trimesh, 4x4_transform) + - ee_home_pose: (4, 4) float32 — EE pose at home configuration +""" + +from __future__ import annotations + +import os +import xml.etree.ElementTree as ET +from dataclasses import dataclass +from functools import lru_cache +from pathlib import Path +from typing import Any + +import numpy as np +from scipy.spatial.transform import Rotation as R + +from cosmos_framework.utils import log + +_MENAGERIE_REPO = "https://github.com/google-deepmind/mujoco_menagerie" +_ROBOT_CACHE_DIR = Path("/tmp") / "mujoco_menagerie" +_FALLBACK_MESH_COLORS = np.asarray( + [ + [52, 152, 219, 255], + [243, 156, 18, 255], + [155, 89, 182, 255], + [46, 204, 113, 255], + [231, 76, 60, 255], + [26, 188, 156, 255], + [241, 196, 15, 255], + [52, 73, 94, 255], + [127, 140, 141, 255], + [211, 84, 0, 255], + [39, 174, 96, 255], + [41, 128, 185, 255], + ], + dtype=np.uint8, +) +_UNSPECIFIED_RGBA_VALUES = ( + np.asarray([0.5, 0.5, 0.5, 1.0], dtype=np.float32), + np.asarray([1.0, 1.0, 1.0, 1.0], dtype=np.float32), + # Some autogenerated URDF/MJCF assets use pink as a placeholder/default + # material. Treat it as unspecified so meshes do not all render pink. 
+ np.asarray([1.0, 0.5, 1.0, 1.0], dtype=np.float32), +) +_UNSPECIFIED_MATERIAL_NAMES = {"", "default", "default_material"} + + +def _fallback_mesh_color(name: str, index: int = 0) -> np.ndarray: + """Return a stable fallback RGBA color for a mesh without material color.""" + + key = name or f"mesh_{index}" + palette_index = sum((char_idx + 1) * ord(char) for char_idx, char in enumerate(key)) + return _FALLBACK_MESH_COLORS[(palette_index + index) % len(_FALLBACK_MESH_COLORS)].copy() + + +def _is_unspecified_rgba(rgba: np.ndarray) -> bool: + """Return whether RGBA is an uninformative default color.""" + + return any(np.allclose(rgba, default_rgba, atol=1e-4) for default_rgba in _UNSPECIFIED_RGBA_VALUES) + + +def _apply_mesh_color(mesh: Any, rgba: np.ndarray) -> None: + """Apply one RGBA color to all mesh faces for viser/trimesh rendering.""" + + face_count = int(len(getattr(mesh, "faces", []))) + if face_count > 0: + mesh.visual.face_colors = np.tile(np.asarray(rgba, dtype=np.uint8), (face_count, 1)) + + +# ── Robot-specific configuration ────────────────────────────────────────────── + +# EE frame name candidates (tried in order by IK solver) +# --------------------------------------------------------------------------- +# End-effector frame references per dataset +# --------------------------------------------------------------------------- +# Each dataset records EE poses at a specific URDF link. The IK solver must +# target the same frame (via ``ee_frame`` or the ``_EE_FRAME_CANDIDATES`` +# fallback list). Below we document, for every robot: +# (a) which link the dataset records at, with upstream source reference, +# (b) what that link physically represents (flange, gripper body, or TCP), +# (c) any orientation transforms applied during recording. +# +# ── Google Robot (fractal / RT-1) ────────────────────────────────────────── +# Recorded link: ``link_gripper_tcp`` +# The TFDS observation field is ``base_pose_tool_reached`` (shape (7,)). 
+# Source: https://www.tensorflow.org/datasets/catalog/fractal20220817_data +# Meaning: TCP — a calibrated tool-center-point 164 mm past the wrist +# (``link_gripper``), roughly at the fingertip. The name contains "tool" +# and SimplerEnv's real→sim calibration targets the same link: +# ``self.ee_link_name = "link_gripper_tcp"`` +# Source: https://github.com/simpler-env/ManiSkill2_real2sim/blob/ef7a4d4/mani_skill2_real2sim/agents/configs/google_robot/defaults.py#L141 +# Orientation: recorded as-is from the URDF FK (quaternion). No extra +# rotation applied. +# Re-referenced to: ``link_gripper`` (gripper body / last actuated link) +# via ``_TCP_TO_FLANGE`` in the dataset class (−164 mm along local z). +# See ``fractal.py`` for the full transform and rationale. +# Our config: ``"ee_frame": "link_gripper"`` (explicit). +# Important MJCF note: MuJoCo Menagerie places ``base_link`` at +# ``pos="0 0 0.06205"`` under ``worldbody``. Pinocchio's +# ``buildModelFromMJCF()`` omits this global root transform, so IK / dataset +# poses live in a root-free world that is 62.05 mm lower than raw MuJoCo +# body/site poses. We therefore strip the same root transform from MuJoCo +# mesh/body/site outputs before rendering or comparing them. +# +# ── WidowX (bridge) ─────────────────────────────────────────────────────── +# Recorded link: ``ee_gripper_link`` +# The bridge data collection uses the Interbotix SDK's Modern Robotics FK +# (``FKinSpace(M, Slist, q)``). The M matrix for wx250s terminates at +# ``ee_gripper_link`` — confirmed by the Interbotix source comment and +# numerically: ``wx250s.M[0,3] = 0.458325`` matches ``ee_gripper_link`` +# position exactly (vs ``gripper_link`` at 0.3648). 
+# Source: https://github.com/Interbotix/interbotix_ros_toolboxes/blob/main/interbotix_xs_toolbox/interbotix_xs_modules/src/interbotix_xs_modules/mr_descriptions.py +# Line 1: "Note that the end-effector is positioned at '/ee_gripper_link'" +# SimplerEnv confirms: ``self.ee_link_name = "ee_gripper_link"`` +# Source: https://github.com/simpler-env/ManiSkill2_real2sim/blob/ef7a4d4/mani_skill2_real2sim/agents/configs/widowx/defaults.py#L87 +# Meaning: Interbotix-defined "end-effector" reference point, 93.6 mm past +# the wrist (``gripper_link``), 27.6 mm past the finger pivot +# (``fingers_link``). This is roughly the grasp center between the +# finger pads — *not* the fingertip. +# Orientation: the bridge data collection applies ``DEFAULT_ROTATION`` to +# the FK orientation before recording (in ``transform2state``): +# ``euler = rotationMatrixToEulerAngles(rot.dot(default_rotation.T))`` +# where ``DEFAULT_ROTATION = [[0,0,1],[0,1,0],[-1,0,0]]``. +# Source: https://github.com/rail-berkeley/bridge_data_robot/blob/main/widowx_envs/widowx_controller/src/widowx_controller/widowx_controller.py#L44 +# Re-referenced to: ``gripper_link`` (wrist rotate body / last actuated link) +# via ``_TCP_TO_FLANGE`` in the dataset class (−93.6 mm along local x). +# See ``bridge_orig_lerobot_dataset.py`` for the full transform and rationale. +# Our config: ``"ee_frame": "gripper_link"`` (explicit). 
+# +# ── Franka + Robotiq 2F-85 (DROID) ──────────────────────────────────────── +# Recorded link: ``panda_link8`` (= Franka flange) +# DROID's ``get_robot_state()`` calls Polymetis FK: +# ``pos, quat = self._robot.robot_model.forward_kinematics(joint_positions)`` +# Source: https://github.com/droid-dataset/droid/blob/main/droid/franka/robot.py +# Polymetis is configured with ``ee_link_name: panda_link8``: +# Source: https://github.com/facebookresearch/fairo/blob/main/polymetis/polymetis/conf/robot_model/franka_panda.yaml +# Meaning: flange — the bare mounting plate at the end of the Franka arm, +# *before* the Robotiq gripper. +# MJCF body: ``panda_hand`` in the composite model (pos 0 0 0.107 +# above ``link7``, identity quat). This is at the panda_link8 +# (flange) position with link7 orientation, matching the URDF. +# ``link7`` is panda_link7 (wrist), *not* the flange. +# Orientation: Polymetis reports FK orientation at ``panda_link8`` frame +# as-is. No extra rotation applied. +# Current viewer convention: all Franka-backed datasets reuse this DROID +# composite model, so ``franka_panda`` is the only supported Franka robot +# config in this module. +# Our config: ``"ee_frame": "panda_hand"`` (explicit). 
+# --------------------------------------------------------------------------- + +_EE_FRAME_CANDIDATES = [ + "link_gripper_tcp", # SimplerEnv Google Robot (calibrated TCP at fingertip) + "link_gripper", # Google Robot (Menagerie / fallback) + "ee_gripper_link", # SimplerEnv WidowX (end of finger chain) + "gripper_link", # WidowX (MuJoCo body name) + "wx250s/gripper_link", # WidowX Menagerie (prefixed body name) + "gripper", # Google Robot (Menagerie site) + "hand", # Franka Panda (standard gripper) + "attachment", # Franka + Robotiq + "attachment_site", # UR5e (mujoco_menagerie) + "wx250s/gripper_link", # WidowX 250S (Menagerie) + "ee_link", # generic + "tool0", # generic industrial +] + +# Robot-specific configs: menagerie name, MJCF filename, joint info, finger range. +ROBOT_CONFIGS = { + "google_robot": { + "menagerie": "google_robot", + "mjcf": "robot.xml", + # EE: link_gripper (gripper body / last actuated link). + # The Menagerie model has link_gripper directly (no TCP frame). + # The dataset applies _TCP_TO_FLANGE to shift from the recorded + # link_gripper_tcp to link_gripper, so poses already target this frame. + # MuJoCo Menagerie also adds a global worldbody -> base_link offset + # (+62.05 mm in z). Pinocchio's MJCF importer omits that root transform, + # so the viewer removes it from MuJoCo outputs to keep meshes / body + # frames aligned with dataset poses and IK outputs. 
+ "ee_frame": "link_gripper", + "pinocchio_removed_root_body": "base_link", + "arm_joints": [ + "joint_torso", + "joint_shoulder", + "joint_bicep", + "joint_elbow", + "joint_forearm", + "joint_wrist", + "joint_gripper", + ], + "n_arm_joints": 7, + "finger_joint_names": ["joint_finger_left", "joint_finger_right"], + "finger_min": 0.01, + "finger_max": 1.30, + "finger_close_is_max": True, + "camera_body": None, + }, + "franka_panda": { + "menagerie": "droid_franka_robotiq", + "mjcf": "panda_updated_robotiq_2f85.xml", + # EE: panda_link8 (flange) — the bare mounting plate, *not* the + # gripper TCP. MJCF body: ``panda_hand`` in the composite model + # (pos 0 0 0.107 above link7, identity quat). + + "ee_frame": "panda_hand", + "arm_joints": [ + "joint1", + "joint2", + "joint3", + "joint4", + "joint5", + "joint6", + "joint7", + ], + "n_arm_joints": 7, + "finger_min": 0.0, + "finger_max": 0.8, + "finger_close_is_max": True, + # All Robotiq 2F-85 chain joints (some share names with bodies in + # pinocchio MJCF parse — must keep all to avoid buildReducedModel error) + "finger_joint_names": [ + "right_driver_joint", + "left_driver_joint", + "left_spring_link_joint", + "left_follower", + "right_spring_link_joint", + "right_follower_joint", + ], + "camera_body": None, + }, + "widowx": { + "menagerie": "trossen_wx250s", + "mjcf": "wx250s.xml", + # EE: gripper_link (wrist rotate body / last actuated link). + # The Menagerie model has gripper_link directly. + # The dataset applies _TCP_TO_FLANGE to shift from the recorded + # ee_gripper_link to gripper_link, so poses already target this frame. 
+ "ee_frame": "gripper_link", + "arm_joints": [ + "waist", + "shoulder", + "elbow", + "forearm_roll", + "wrist_angle", + "wrist_rotate", + ], + "n_arm_joints": 6, + "finger_min": 0.015, + "finger_max": 0.037, + "finger_close_is_max": False, + "finger_joint_names": ["left_finger", "right_finger"], + "camera_body": None, + }, + "ur5e": { + "menagerie": "ur5e_robotiq", + "mjcf": "ur5e_robotiq_2f85.xml", + # EE: attachment_site — the UR5e flange site where the Robotiq mounts. + # joint_configs[:, 6] raw UR gripper maps directly to Robotiq ctrl: + # ctrl = raw * 255 (0=open, 255=closed). + "ee_frame": "attachment_site", + "n_arm_joints": 6, + "finger_min": 0.0, + "finger_max": 255.0, + "finger_close_is_max": False, + "camera_body": None, + }, + +} + + +def get_urdf_path(robot_name: str) -> str | None: + """Get URDF path for a robot if available, else None (falls back to MJCF in caller).""" + cfg = ROBOT_CONFIGS.get(robot_name) + if cfg is None: + return None + + menagerie_name = cfg.get("menagerie") + if menagerie_name is None: + return None + mjcf_dir = _ensure_robot_assets(menagerie_name) + + urdf_filename = cfg.get("urdf") + if urdf_filename: + urdf_path = mjcf_dir / urdf_filename + if urdf_path.exists(): + return str(urdf_path) + + urdfs = list(mjcf_dir.glob("*.urdf")) + if urdfs: + return str(urdfs[0]) + + return None + + +def get_mjcf_path(robot_name: str) -> str: + """Get MJCF path for a robot, downloading from mujoco_menagerie if needed.""" + cfg = ROBOT_CONFIGS.get(robot_name) + if cfg is None: + raise ValueError(f"Unknown robot: {robot_name}. 
def get_mjcf_path(robot_name: str) -> str:
    """Return the MJCF file path for ``robot_name``, fetching assets if needed.

    Args:
        robot_name: Key into ``ROBOT_CONFIGS``.

    Returns:
        Path (as ``str``) of the configured MJCF file, or of ``robot.xml`` in
        the same asset directory when the configured file does not exist.

    Raises:
        ValueError: ``robot_name`` is not a key of ``ROBOT_CONFIGS``.
        FileNotFoundError: The config has no ``"menagerie"`` entry, or neither
            candidate file exists in the asset directory.
    """
    cfg = ROBOT_CONFIGS.get(robot_name)
    if cfg is None:
        raise ValueError(f"Unknown robot: {robot_name}. Available: {list(ROBOT_CONFIGS.keys())}")

    menagerie_name = cfg.get("menagerie")
    if menagerie_name is None:
        raise FileNotFoundError(f"Robot {robot_name!r} does not have a MuJoCo MJCF asset.")

    mjcf_dir = _ensure_robot_assets(menagerie_name)
    mjcf_path = mjcf_dir / cfg.get("mjcf", "robot.xml")
    # Configured file first, then the conventional default name.
    for candidate in (mjcf_path, mjcf_dir / "robot.xml"):
        if candidate.exists():
            return str(candidate)

    raise FileNotFoundError(f"MJCF not found: {mjcf_path} (also tried robot.xml)")


# ── Asset download ───────────────────────────────────────────────────────────


def _ensure_robot_assets(robot_name: str) -> Path:
    """Ensure robot MJCF and mesh assets are available locally.

    A directory ``_ROBOT_CACHE_DIR / robot_name`` that already exists is
    returned as-is. Otherwise the composite robots (``droid_franka_robotiq``,
    ``ur5e_robotiq``) are assembled by their dedicated builders, and any other
    name is downloaded as a single model from mujoco_menagerie.

    Assets are cached in ``~/.cache/mujoco_menagerie/<robot_name>/``
    (NOTE(review): the root is whatever ``_ROBOT_CACHE_DIR`` is set to; the
    literal path is taken from the original docstring).

    Returns:
        Path to the cached MJCF directory (caller picks the right .xml).
    """
    cached_dir = _ROBOT_CACHE_DIR / robot_name
    if cached_dir.exists():
        return cached_dir

    # Composite: Franka arm + Robotiq 2F-85 gripper (DROID setup)
    if robot_name == "droid_franka_robotiq":
        return _build_droid_franka_robotiq()

    # Composite: UR5e arm + Robotiq 2F-85 gripper (RoboMIND UR setup)
    if robot_name == "ur5e_robotiq":
        return _build_ur5e_robotiq()

    log.info(f"Downloading {robot_name} assets from mujoco_menagerie...")
    _download_menagerie_model(robot_name)
    return cached_dir


def _download_menagerie_model(model_name: str) -> Path:
    """Download a single model from MuJoCo Menagerie. Returns cached path.

    The model directory is fetched with a sparse, blob-less shallow clone into
    a temporary directory, copied to a hidden staging directory inside the
    cache, and only then renamed to its final name. The rename is what makes
    the cache entry visible, so an interrupted copy can never be mistaken for
    a complete download by the ``exists()`` checks.

    Raises:
        subprocess.CalledProcessError: A git command failed (its stderr is
            logged before re-raising).
        FileNotFoundError: The repository has no directory named ``model_name``.
    """
    dst = _ROBOT_CACHE_DIR / model_name
    if dst.exists():
        return dst

    import shutil
    import subprocess
    import tempfile

    _ROBOT_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    with tempfile.TemporaryDirectory() as tmpdir:
        git_steps = (
            (["git", "clone", "--depth=1", "--filter=blob:none", "--sparse", _MENAGERIE_REPO, tmpdir], None),
            (["git", "sparse-checkout", "set", model_name], tmpdir),
        )
        for cmd, cwd in git_steps:
            try:
                subprocess.run(cmd, cwd=cwd, check=True, capture_output=True)
            except subprocess.CalledProcessError as error:
                # capture_output hides git's message; surface it before propagating.
                stderr = error.stderr.decode(errors="replace").strip() if error.stderr else ""
                log.warning(f"git command failed ({' '.join(cmd[:2])}): {stderr}")
                raise

        src = Path(tmpdir) / model_name
        if not src.is_dir():
            raise FileNotFoundError(f"Model {model_name!r} not found in {_MENAGERIE_REPO}")

        staging = _ROBOT_CACHE_DIR / f".{model_name}.partial"
        if staging.exists():
            shutil.rmtree(staging)  # leftover from an earlier interrupted run
        shutil.copytree(src, staging)
        if dst.exists():
            shutil.rmtree(dst)
        staging.rename(dst)

    log.info(f"Cached {model_name} assets at {dst}")
    return dst
Returns cached path.""" + cached_dir = _ROBOT_CACHE_DIR / model_name + if cached_dir.exists(): + return cached_dir + + import shutil + import subprocess + import tempfile + + _ROBOT_CACHE_DIR.mkdir(parents=True, exist_ok=True) + with tempfile.TemporaryDirectory() as tmpdir: + subprocess.run( + ["git", "clone", "--depth=1", "--filter=blob:none", "--sparse", _MENAGERIE_REPO, tmpdir], + check=True, + capture_output=True, + ) + subprocess.run( + ["git", "sparse-checkout", "set", model_name], + cwd=tmpdir, + check=True, + capture_output=True, + ) + src = Path(tmpdir) / model_name + dst = _ROBOT_CACHE_DIR / model_name + if dst.exists(): + shutil.rmtree(dst) + shutil.copytree(src, dst) + + log.info(f"Cached {model_name} assets at {dst}") + return dst + + +def _build_droid_franka_robotiq() -> Path: + """Prepare composite Franka + Robotiq 2F-85 MJCF for DROID visualization. + + The hand-tuned composite XML is committed in the repo alongside this + module (``droid_franka_robotiq_2f85.xml``). This function downloads the + mesh assets from MuJoCo Menagerie (``franka_emika_panda`` + + ``robotiq_2f85_v4``) and copies them together with the XML into a cache + directory that MuJoCo can load from. 
+ """ + import shutil + + log.info("Preparing composite Franka + Robotiq 2F-85 (committed XML + Menagerie assets)...") + + franka_dir = _download_menagerie_model("franka_emika_panda") + robotiq_dir = _download_menagerie_model("robotiq_2f85_v4") + + dst = _ROBOT_CACHE_DIR / "droid_franka_robotiq" + dst.mkdir(parents=True, exist_ok=True) + assets_dir = dst / "assets" + assets_dir.mkdir(exist_ok=True) + + for f in (franka_dir / "assets").iterdir(): + shutil.copy2(f, assets_dir / f.name) + for f in (robotiq_dir / "assets").iterdir(): + shutil.copy2(f, assets_dir / f.name) + + committed_xml = Path(__file__).parent / "droid_franka_robotiq_2f85.xml" + out_path = dst / "panda_updated_robotiq_2f85.xml" + shutil.copy2(committed_xml, out_path) + + log.info(f"Prepared composite Franka + Robotiq 2F-85 at {dst}") + return dst + + +def _build_ur5e_robotiq() -> Path: + """Prepare composite UR5e + Robotiq 2F-85 MJCF for RoboMIND UR visualization. + + The hand-tuned composite XML is committed in the repo alongside this + module (``ur5e_robotiq_2f85.xml``). This function downloads the mesh + assets from MuJoCo Menagerie (``universal_robots_ur5e`` + + ``robotiq_2f85_v4``) and copies them together with the XML into a cache + directory that MuJoCo can load from. 
# ── Robot loaders ────────────────────────────────────────────────────────────


def _load_google_robot() -> tuple[list, np.ndarray]:
    """Load the Google Robot model shipped with MuJoCo Menagerie.

    Model source:
    https://github.com/google-deepmind/mujoco_menagerie/tree/main/google_robot

    Returns:
        ``(meshes, ee_pose)``: the visual meshes at the model's default
        configuration and the 4x4 float32 pose of body ``link_gripper``.
        Both have the MuJoCo-to-Pinocchio world correction applied.
    """
    import mujoco

    robot_cfg = ROBOT_CONFIGS["google_robot"]
    xml_path = _ensure_robot_assets(robot_cfg["menagerie"]) / robot_cfg["mjcf"]
    model = mujoco.MjModel.from_xml_path(str(xml_path))
    data = mujoco.MjData(model)
    mujoco.mj_forward(model, data)

    meshes = _extract_mujoco_meshes(model, data)
    world_correction = get_mujoco_to_pinocchio_world_transform(model, data, "google_robot")
    identity = np.eye(4, dtype=np.float32)
    if not np.allclose(world_correction, identity):
        meshes = _apply_world_transform_to_meshes(meshes, world_correction)

    # Find EE pose at link_gripper; stays identity when no body has that name.
    ee_pose = np.eye(4, dtype=np.float32)
    gripper_body = next(
        (
            body
            for body in range(model.nbody)
            if (mujoco.mj_id2name(model, mujoco.mjtObj.mjOBJ_BODY, body) or "") == "link_gripper"
        ),
        None,
    )
    if gripper_body is not None:
        ee_pose[:3, 3] = data.xpos[gripper_body].astype(np.float32)
        ee_pose[:3, :3] = data.xmat[gripper_body].reshape(3, 3).astype(np.float32)
    ee_pose = (world_correction @ ee_pose).astype(np.float32)

    log.info(f"Google Robot (Menagerie) loaded: {len(meshes)} meshes, EE pos={ee_pose[:3, 3]}")
    return meshes, ee_pose


def _load_franka_panda() -> tuple[list, np.ndarray]:
    """Load Franka Panda + Robotiq 2F-85 gripper (DROID variant).

    Uses the composite model prepared by ``_ensure_robot_assets`` and poses the
    arm at a fixed home configuration before extracting meshes.

    Returns:
        ``(meshes, ee_pose)`` where ``ee_pose`` is the 4x4 float32 pose of the
        site named ``gripper``; if no such site exists, of the first body named
        ``base_mount`` or ``attachment``; identity if neither is found.
    """
    import mujoco

    xml_path = _ensure_robot_assets("droid_franka_robotiq") / "panda_updated_robotiq_2f85.xml"
    model = mujoco.MjModel.from_xml_path(str(xml_path))
    data = mujoco.MjData(model)

    # Franka home configuration
    home_qpos = np.array([0.0, -0.78, 0.0, -2.36, 0.0, 1.57, 0.78])
    data.qpos[: len(home_qpos)] = home_qpos
    mujoco.mj_forward(model, data)

    meshes = _extract_mujoco_meshes(model, data)

    # NOTE(review): ROBOT_CONFIGS["franka_panda"] declares ee_frame "panda_hand",
    # while this home pose is read from the "gripper" site — confirm intended.
    ee_pose = np.eye(4, dtype=np.float32)
    gripper_site = next(
        (
            site
            for site in range(model.nsite)
            if (mujoco.mj_id2name(model, mujoco.mjtObj.mjOBJ_SITE, site) or "") == "gripper"
        ),
        None,
    )
    if gripper_site is not None:
        ee_pose[:3, 3] = data.site_xpos[gripper_site].astype(np.float32)
        ee_pose[:3, :3] = data.site_xmat[gripper_site].reshape(3, 3).astype(np.float32)
    else:
        # Fallback to body search
        mount_body = next(
            (
                body
                for body in range(model.nbody)
                if (mujoco.mj_id2name(model, mujoco.mjtObj.mjOBJ_BODY, body) or "") in ("base_mount", "attachment")
            ),
            None,
        )
        if mount_body is not None:
            ee_pose[:3, 3] = data.xpos[mount_body].astype(np.float32)
            ee_pose[:3, :3] = data.xmat[mount_body].reshape(3, 3).astype(np.float32)

    log.info(f"Franka Panda + Robotiq loaded: {len(meshes)} meshes, EE pos={ee_pose[:3, 3]}")
    return meshes, ee_pose
+ ee_pose = np.eye(4, dtype=np.float32) + for si in range(model.nsite): + name = mujoco.mj_id2name(model, mujoco.mjtObj.mjOBJ_SITE, si) or "" + if name == "gripper": + ee_pose[:3, 3] = data.site_xpos[si].astype(np.float32) + ee_pose[:3, :3] = data.site_xmat[si].reshape(3, 3).astype(np.float32) + break + else: + # Fallback to body search + for i in range(model.nbody): + name = mujoco.mj_id2name(model, mujoco.mjtObj.mjOBJ_BODY, i) or "" + if name in ("base_mount", "attachment"): + ee_pose[:3, 3] = data.xpos[i].astype(np.float32) + ee_pose[:3, :3] = data.xmat[i].reshape(3, 3).astype(np.float32) + break + + log.info(f"Franka Panda + Robotiq loaded: {len(meshes)} meshes, EE pos={ee_pose[:3, 3]}") + return meshes, ee_pose + + +def _load_widowx() -> tuple[list, np.ndarray]: + """Load WidowX 250S from MuJoCo Menagerie (trossen_wx250s). + + Uses the official Trossen Robotics WX250S model from + https://github.com/google-deepmind/mujoco_menagerie/tree/main/trossen_wx250s + which has visual OBJ meshes for accurate rendering. 
+ """ + import mujoco + + cfg = ROBOT_CONFIGS["widowx"] + mjcf_dir = _ensure_robot_assets(cfg["menagerie"]) + mjcf_path = mjcf_dir / cfg["mjcf"] + model = mujoco.MjModel.from_xml_path(str(mjcf_path)) + data = mujoco.MjData(model) + mujoco.mj_forward(model, data) + + meshes = _extract_mujoco_meshes(model, data) + + ee_pose = np.eye(4, dtype=np.float32) + for i in range(model.nbody): + name = mujoco.mj_id2name(model, mujoco.mjtObj.mjOBJ_BODY, i) or "" + if name == "gripper_link": + ee_pose[:3, 3] = data.xpos[i].astype(np.float32) + ee_pose[:3, :3] = data.xmat[i].reshape(3, 3).astype(np.float32) + break + + log.info(f"WidowX 250S (Menagerie) loaded: {len(meshes)} meshes, EE pos={ee_pose[:3, 3]}") + return meshes, ee_pose + + +def _load_ur5e() -> tuple[list, np.ndarray]: + """Load UR5e + Robotiq 2F-85 composite from MuJoCo Menagerie.""" + import mujoco + + mjcf_dir = _ensure_robot_assets("ur5e_robotiq") + mjcf_path = mjcf_dir / "ur5e_robotiq_2f85.xml" + model = mujoco.MjModel.from_xml_path(str(mjcf_path)) + data = mujoco.MjData(model) + + # UR5e home: -90, -90, 90, -90, -90, 0 (degrees → radians); gripper open + home_qpos = np.array([-1.5708, -1.5708, 1.5708, -1.5708, -1.5708, 0.0]) + data.qpos[: len(home_qpos)] = home_qpos + mujoco.mj_forward(model, data) + + meshes = _extract_mujoco_meshes(model, data) + + # EE pose: use robotiq_base body (the flange-to-gripper attachment point) + ee_pose = np.eye(4, dtype=np.float32) + for i in range(model.nbody): + name = mujoco.mj_id2name(model, mujoco.mjtObj.mjOBJ_BODY, i) or "" + if name == "robotiq_base": + ee_pose[:3, 3] = data.xpos[i].astype(np.float32) + ee_pose[:3, :3] = data.xmat[i].reshape(3, 3).astype(np.float32) + break + + log.info(f"UR5e+Robotiq loaded: {len(meshes)} meshes, EE pos={ee_pose[:3, 3]}") + return meshes, ee_pose + + +def _parse_urdf_origin(origin_element) -> np.ndarray: + transform = np.eye(4, dtype=np.float32) + if origin_element is None: + return transform + + xyz_text = origin_element.attrib.get("xyz", 
"0 0 0") + rpy_text = origin_element.attrib.get("rpy", "0 0 0") + xyz = np.asarray([float(value) for value in xyz_text.split()], dtype=np.float32) + rpy = np.asarray([float(value) for value in rpy_text.split()], dtype=np.float32) + transform[:3, :3] = R.from_euler("xyz", rpy).as_matrix().astype(np.float32) + transform[:3, 3] = xyz + return transform + + +def _parse_urdf_color(color_element: ET.Element | None) -> np.ndarray | None: + if color_element is None: + return None + rgba_text = color_element.attrib.get("rgba") + if not rgba_text: + return None + rgba = np.asarray([float(value) for value in rgba_text.split()], dtype=np.float32) + if rgba.shape != (4,): + return None + return (np.clip(rgba, 0.0, 1.0) * 255.0).astype(np.uint8) + + +def _parse_urdf_material_colors(root: ET.Element) -> dict[str, np.ndarray]: + material_colors: dict[str, np.ndarray] = {} + for material_element in root.findall("material"): + material_name = material_element.attrib.get("name", "") + color = _parse_urdf_color(material_element.find("color")) + if material_name and color is not None: + material_colors[material_name] = color + return material_colors + + +def _resolve_urdf_visual_color(parent_element: ET.Element, material_colors: dict[str, np.ndarray]) -> np.ndarray | None: + material_element = parent_element.find("material") + if material_element is None: + return None + + material_name = material_element.attrib.get("name", "") + color = _parse_urdf_color(material_element.find("color")) + if color is None and material_name: + color = material_colors.get(material_name) + if color is None: + return None + + color_unit = color.astype(np.float32) / 255.0 + if material_name.lower() in _UNSPECIFIED_MATERIAL_NAMES or _is_unspecified_rgba(color_unit): + return None + return color + + +# ── Helpers ────────────────────────────────────────────────────────────────── + + +def _get_visual_geom_ids(model) -> list[int]: + """Get visual geom IDs from a MuJoCo model. + + Strategy: + 1. 
Google Robot uses *_v suffix for visual meshes → use those + 2. Other robots (Franka, WidowX) use geom_group=2 for visual → use those + 3. Fallback: all mesh geoms + + Includes both mesh geoms and primitive geoms (box, cylinder, sphere) in group 2. + """ + import mujoco + + VISUAL_TYPES = { + mujoco.mjtGeom.mjGEOM_MESH, + mujoco.mjtGeom.mjGEOM_BOX, + mujoco.mjtGeom.mjGEOM_CYLINDER, + mujoco.mjtGeom.mjGEOM_SPHERE, + mujoco.mjtGeom.mjGEOM_CAPSULE, + mujoco.mjtGeom.mjGEOM_ELLIPSOID, + } + + v_ids = [] + group2_ids = [] + all_ids = [] + for gi in range(model.ngeom): + gtype = model.geom_type[gi] + if gtype not in VISUAL_TYPES: + continue + all_ids.append(gi) + if gtype == mujoco.mjtGeom.mjGEOM_MESH: + mesh_id = model.geom_dataid[gi] + mesh_name = mujoco.mj_id2name(model, mujoco.mjtObj.mjOBJ_MESH, mesh_id) or "" + if mesh_name.endswith("_v"): + v_ids.append(gi) + if model.geom_group[gi] == 2: + group2_ids.append(gi) + + # Priority: _v suffix > group 2 > all + if v_ids: + return v_ids + if group2_ids: + return group2_ids + return all_ids + + +def _resolve_geom_color(model, gi: int, geom_name: str) -> np.ndarray: + """Resolve the effective RGBA color for a MuJoCo geom. + + Priority: + 1. If geom has a material with a diffuse texture → sample average texture color + 2. If geom has a material with non-default rgba → use mat_rgba + 3. If geom_rgba is non-default → use geom_rgba + 4. Fallback: deterministic per-geom color + + Returns (4,) uint8 RGBA. 
def _extract_mujoco_meshes(model, data) -> list:
    """Extract visual geom meshes from a MuJoCo model as trimesh objects.

    Uses the same visual geom filtering as ``ik_solver._get_visual_geom_ids``
    to ensure mesh ordering matches IK transform output.

    For geoms with primitive types (box, sphere, cylinder, capsule,
    ellipsoid), generates the corresponding trimesh primitive. Every geom type
    accepted by ``_get_visual_geom_ids`` is handled, so the returned list has
    one entry per selected geom, in the same order.

    Applies per-geom RGBA color from MuJoCo materials when available.

    Args:
        model: MuJoCo model.
        data: MuJoCo data on which forward kinematics has already been run
            (``geom_xpos`` / ``geom_xmat`` are read as-is).

    Returns list of (geom_name, trimesh.Trimesh, 4x4_transform).
    """
    import mujoco
    import trimesh

    geom_kind = mujoco.mjtGeom
    meshes = []
    # Use the same visual filter as IK solver
    for gi in _get_visual_geom_ids(model):
        gtype = model.geom_type[gi]
        size = model.geom_size[gi].copy()

        geom_name = mujoco.mj_id2name(model, mujoco.mjtObj.mjOBJ_GEOM, gi) or f"geom_{gi}"

        mesh = None
        if gtype == geom_kind.mjGEOM_MESH:
            mesh_id = model.geom_dataid[gi]
            vert_start = model.mesh_vertadr[mesh_id]
            face_start = model.mesh_faceadr[mesh_id]
            verts = model.mesh_vert[vert_start : vert_start + model.mesh_vertnum[mesh_id]].copy()
            faces = model.mesh_face[face_start : face_start + model.mesh_facenum[mesh_id]].copy()
            mesh = trimesh.Trimesh(vertices=verts, faces=faces)
        elif gtype == geom_kind.mjGEOM_BOX:
            # MuJoCo stores half-extents.
            mesh = trimesh.creation.box(extents=size * 2)
        elif gtype == geom_kind.mjGEOM_SPHERE:
            mesh = trimesh.creation.icosphere(radius=size[0])
        elif gtype == geom_kind.mjGEOM_CYLINDER:
            mesh = trimesh.creation.cylinder(radius=size[0], height=size[1] * 2)
        elif gtype == geom_kind.mjGEOM_CAPSULE:
            mesh = trimesh.creation.capsule(radius=size[0], height=size[1] * 2)
            # MuJoCo capsules are centred on the geom frame. Re-centre on the
            # bounding box so the result does not depend on where the trimesh
            # version in use places the capsule (no-op if already centred).
            mesh.apply_translation(-mesh.bounds.mean(axis=0))
        elif gtype == geom_kind.mjGEOM_ELLIPSOID:
            # Unit sphere scaled by the three semi-axes.
            mesh = trimesh.creation.icosphere(radius=1.0)
            mesh.apply_transform(np.diag([size[0], size[1], size[2], 1.0]))

        if mesh is None:
            continue

        _apply_mesh_color(mesh, _resolve_geom_color(model, gi, geom_name))
        transform = np.eye(4)
        transform[:3, :3] = data.geom_xmat[gi].reshape(3, 3).copy()
        transform[:3, 3] = data.geom_xpos[gi].copy()
        meshes.append((geom_name, mesh, transform))
    return meshes
def get_mujoco_to_pinocchio_world_transform(model, data, robot_name: str | None = None) -> np.ndarray:
    """Return a correction that maps MuJoCo world poses into Pinocchio world.

    Some MJCFs carry a fixed worldbody -> root-body transform that Pinocchio's
    ``buildModelFromMJCF()`` omits, which leaves raw MuJoCo body/site poses
    globally shifted relative to IK and dataset poses. For robots whose config
    names that root body under ``"pinocchio_removed_root_body"``, the inverse
    of the root body's world pose is returned so callers can strip the offset.

    Args:
        model: MuJoCo model.
        data: MuJoCo data holding current body poses (``xpos`` / ``xmat``).
        robot_name: Key into ``ROBOT_CONFIGS``; ``None`` or an unknown name
            means no correction.

    Returns:
        4x4 float32 matrix; identity when no root body is configured or the
        configured body is not present in ``model`` (a warning is logged).
    """
    import mujoco

    robot_cfg = ROBOT_CONFIGS.get(robot_name, {}) if robot_name else {}
    root_body_name = robot_cfg.get("pinocchio_removed_root_body")
    no_correction = np.eye(4, dtype=np.float32)
    if not root_body_name:
        return no_correction

    root_body_id = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_BODY, root_body_name)
    if root_body_id < 0:
        log.warning(f"Configured root body '{root_body_name}' not found in MuJoCo model")
        return no_correction

    root_in_world = np.eye(4, dtype=np.float32)
    root_in_world[:3, :3] = data.xmat[root_body_id].reshape(3, 3).astype(np.float32)
    root_in_world[:3, 3] = data.xpos[root_body_id].astype(np.float32)
    return np.linalg.inv(root_in_world).astype(np.float32)
def _apply_world_transform_to_meshes(
    meshes: list[tuple[str, object, np.ndarray]],
    world_transform: np.ndarray,
) -> list[tuple[str, object, np.ndarray]]:
    """Left-multiply a world-space transform into each mesh pose.

    Names and mesh objects are passed through untouched; only the 4x4 pose of
    each entry is replaced by ``world_transform @ pose`` cast to float32.
    """
    return [
        (mesh_name, mesh_obj, (world_transform @ mesh_pose).astype(np.float32))
        for mesh_name, mesh_obj, mesh_pose in meshes
    ]


# ── Public API ───────────────────────────────────────────────────────────────


def get_robot_loaders() -> dict[str, callable]:
    """Get the robot loader registry.

    Maps robot_name → zero-argument loader returning (meshes, ee_home_pose).
    A fresh dict is built on every call.
    """
    registry = {}
    registry["google_robot"] = _load_google_robot
    registry["franka_panda"] = _load_franka_panda
    registry["widowx"] = _load_widowx
    registry["ur5e"] = _load_ur5e
    return registry
def extract_gripper_openings(unified_57d: np.ndarray, robot_name: str = "google_robot") -> np.ndarray:
    """Extract gripper opening fractions from unified action grasp state.

    Uses fingertip spread (f0-f1 distance) to invert the FK and recover
    the scalar gripper opening at each timestep. The first timestep is
    duplicated at the front, so the output has one more entry than the input.

    Args:
        unified_57d: (T, 57) unified action; columns 18:33 are read as five
            3-D finger points, of which the first two are the fingertips.
        robot_name: Robot identifier for FK lookup table. ``"franka_panda"``,
            ``"widowx"`` and ``"ur5e"`` select their own FK; any other value
            uses the Google Robot FK. If the FK module cannot be imported, a
            linear spread model (0 to 0.145) is used instead.

    Returns:
        (T+1,) float32 array of gripper openings in [0, 1]. For ``T == 0`` a
        single 0.0 is returned, since there is no grasp state to invert.
    """
    T = unified_57d.shape[0]
    if T == 0:
        # Nothing to duplicate for the leading frame; keep the (T+1,) contract.
        return np.zeros(1, dtype=np.float32)

    grasp = unified_57d[:, 18:33].reshape(T, 5, 3)
    all_grasp = np.concatenate([grasp[0:1], grasp], axis=0)

    # Build monotonic inverse lookup from FK (robot-specific)
    _gs = np.linspace(0, 1, 10001).astype(np.float32)
    try:
        if robot_name == "franka_panda":
            from cosmos_framework.data.vfm.action.robot_descriptions.franka import franka_fingertip_fk

            _tips = franka_fingertip_fk(_gs)
        elif robot_name == "widowx":
            from cosmos_framework.data.vfm.action.robot_descriptions.widowx import widowx_fingertip_fk

            _tips = widowx_fingertip_fk(_gs)
        elif robot_name == "ur5e":
            from cosmos_framework.data.vfm.action.robot_descriptions.umi import _WSG50_MAX_WIDTH, umi_fingertip_fk

            _tips = umi_fingertip_fk(_gs * _WSG50_MAX_WIDTH)
        else:
            from cosmos_framework.data.vfm.action.robot_descriptions.google_robot import (
                google_robot_fingertip_fk_vectorized,
            )

            _tips = google_robot_fingertip_fk_vectorized(_gs)
        _spreads = np.linalg.norm(_tips[:, 0] - _tips[:, 1], axis=1)
    except ImportError:
        # Fallback: linear approximation
        _spreads = _gs * 0.145

    # Keep only the branch from the minimum spread onwards, which is treated
    # as monotonically increasing for the interpolation below.
    _min_idx = int(np.argmin(_spreads))
    mono_gs = _gs[_min_idx:]
    mono_spreads = _spreads[_min_idx:]

    spreads = np.linalg.norm(all_grasp[:, 0] - all_grasp[:, 1], axis=1)
    openings = np.interp(spreads, mono_spreads, mono_gs).astype(np.float32)
    # Saturate outside the table. The lower bound is applied last so it wins
    # when both conditions hold, matching the original branch order.
    openings[spreads >= mono_spreads[-1]] = 1.0
    openings[spreads <= mono_spreads[0]] = 0.0
    return openings
openings[t] = 0.0 + elif spread >= mono_spreads[-1]: + openings[t] = 1.0 + else: + openings[t] = float(np.interp(spread, mono_spreads, mono_gs)) + return openings diff --git a/cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py b/cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py new file mode 100644 index 0000000000000000000000000000000000000000..bca68b09c31447a43a6720b5decbd4fbac762768 --- /dev/null +++ b/cosmos-framework/cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py @@ -0,0 +1,962 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +"""Interactive 3D viewer for robot action datasets. + +Uses the unified 57D action representation: every dataset declares one explicit +raw ``ActionFormat`` (9D/10D/20D/57D), which is converted to +``UnifiedAction(action_57d, mask)`` before rendering. + +**57D layout**: ``[ego(9) | R_wrist(9) | R_fingers(15) | L_wrist(9) | L_fingers(15)]`` + +Dependencies:: + + pip install viser mujoco pin + +Usage: + # Use each dataset's declared raw action format: + uv run python cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py --share + + # Override the raw action format explicitly: + uv run python cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py --action-format 57d --share +""" + +from __future__ import annotations + +import argparse +import importlib +import os +import random +import sys +import time as _time +from dataclasses import dataclass, field +from functools import lru_cache +from pathlib import Path +from typing import Any, cast + +import numpy as np +import torch + +from cosmos_framework.utils import log +from cosmos_framework.data.vfm.action.urdf_visualizer.unified_action import ActionFormat + +_REPO_ROOT = str(Path(__file__).resolve().parents[6]) +if _REPO_ROOT not in sys.path: + sys.path.insert(0, _REPO_ROOT) + +# ── Dataset Registry 
@dataclass
class DatasetEntry:
    """Metadata for a dataset available in the viewer."""

    # Registry / display name of the dataset.
    name: str
    # Robot identifier; the registry passes "" for datasets without a robot model.
    robot_name: str
    max_finger_width: float
    fps: int
    pose_convention: str = "backward_framewise"
    camera_fov_deg: float = 60.0
    camera_aspect: float = 4 / 3
    # Dotted import path of the dataset class (or factory) to instantiate.
    dataset_class: str = ""
    # Keyword arguments forwarded to ``dataset_class``.
    dataset_kwargs: dict[str, Any] = field(default_factory=dict)
    # Raw action layout the dataset emits before conversion to unified 57D.
    action_format: ActionFormat = ActionFormat.SINGLE_ARM_10D
    dual_base_left: np.ndarray | None = None
    dual_base_right: np.ndarray | None = None
    robot_embodiment_type: str | None = None
    to_unified_fn: str | None = None  # "module.path:function" for custom action conversion
    to_opencv: np.ndarray | dict[str, np.ndarray] | None = None  # native EE → OpenCV rotation


def _lazycfg_to_entry(
    cfg: Any,
    *,
    robot_name: str = "",
    max_finger_width: float = 0.0,
    fps: int = 10,
    action_format: ActionFormat = ActionFormat.SINGLE_ARM_10D,
    camera_fov_deg: float = 60.0,
    camera_aspect: float = 4 / 3,
    dual_base_left: np.ndarray | None = None,
    dual_base_right: np.ndarray | None = None,
    robot_embodiment_type: str | None = None,
    to_unified_fn: str | None = None,
    to_opencv: np.ndarray | dict[str, np.ndarray] | None = None,
    viewer_overrides: dict[str, Any] | None = None,
) -> DatasetEntry:
    """Build a viewer dataset entry from a v1p2 ``LazyCall(dataset_entry)`` config.

    ``cfg`` may be a plain dict or an attribute-style config object. Its
    ``dataset`` node supplies the target class (``_target_``) and the dataset
    keyword arguments; its ``name`` becomes the entry name (``"unknown"`` if
    absent). ``action_normalization`` is always reset to ``None`` before
    ``viewer_overrides`` are merged on top. The remaining keyword arguments
    are copied onto the resulting ``DatasetEntry`` unchanged.
    """

    def _lookup(node: Any, key: str) -> Any:
        # Support both mapping-style and attribute-style config nodes.
        return node[key] if isinstance(node, dict) else getattr(node, key)

    ds_cfg = _lookup(cfg, "dataset")
    target = _lookup(ds_cfg, "_target_")
    if isinstance(target, str):
        dataset_class = target
    else:
        dataset_class = f"{target.__module__}.{target.__qualname__}"

    dataset_kwargs = {key: value for key, value in ds_cfg.items() if key != "_target_"}
    dataset_kwargs["action_normalization"] = None
    if viewer_overrides is not None:
        dataset_kwargs.update(viewer_overrides)

    cfg_dict = cfg if isinstance(cfg, dict) else dict(cfg)
    return DatasetEntry(
        name=str(cfg_dict.get("name", "unknown")),
        robot_name=robot_name,
        max_finger_width=max_finger_width,
        fps=fps,
        pose_convention=str(dataset_kwargs.get("pose_convention", "backward_framewise")),
        camera_fov_deg=camera_fov_deg,
        camera_aspect=camera_aspect,
        dataset_class=dataset_class,
        dataset_kwargs=dataset_kwargs,
        action_format=action_format,
        dual_base_left=dual_base_left,
        dual_base_right=dual_base_right,
        robot_embodiment_type=robot_embodiment_type,
        to_unified_fn=to_unified_fn,
        to_opencv=to_opencv,
    )
str(dataset_kwargs.get("pose_convention", "backward_framewise")) + cfg_dict = cfg if isinstance(cfg, dict) else dict(cfg) + dataset_name = str(cfg_dict.get("name", "unknown")) + return DatasetEntry( + name=dataset_name, + robot_name=robot_name, + max_finger_width=max_finger_width, + fps=fps, + pose_convention=pose_convention, + camera_fov_deg=camera_fov_deg, + camera_aspect=camera_aspect, + dataset_class=dataset_class, + dataset_kwargs=dataset_kwargs, + action_format=action_format, + dual_base_left=dual_base_left, + dual_base_right=dual_base_right, + robot_embodiment_type=robot_embodiment_type, + to_unified_fn=to_unified_fn, + to_opencv=to_opencv, + ) + + +def _build_datasets() -> dict[str, DatasetEntry]: + """Build the viewer dataset registry from release-supported action data configs.""" + + from cosmos_framework.data.vfm.action.urdf_visualizer.action_datasets import ( + DATASET_AV_480, + DATASET_BRIDGE_480, + DATASET_DROID_480, + DATASET_FRACTAL_256, + DATASET_ROBOMIND_FRANKA_480, + DATASET_ROBOMIND_FRANKA_DUAL_480, + DATASET_UMI_256, + ) + + raw_action_override = {"action_normalization": None} + + from cosmos_framework.data.vfm.action.bridge_orig_lerobot_dataset import _BRIDGE_TO_OPENCV + from cosmos_framework.data.vfm.action.droid_lerobot_dataset import _DROID_TO_OPENCV + from cosmos_framework.data.vfm.action.fractal import _GOOGLE_ROBOT_TO_OPENCV + from cosmos_framework.data.vfm.action.robomind_franka_dataset import _ROBOMIND_FRANKA_TO_OPENCV + + _FRANKA_TO_OPENCV = _ROBOMIND_FRANKA_TO_OPENCV[:3, :3] + + return { + "av": _lazycfg_to_entry( + DATASET_AV_480, + robot_name="", + max_finger_width=0.0, + fps=10, + action_format=ActionFormat.EGO_9D, + viewer_overrides=raw_action_override, + ), + "fractal": _lazycfg_to_entry( + DATASET_FRACTAL_256, + robot_name="google_robot", + max_finger_width=0.05, + fps=3, + action_format=ActionFormat.SINGLE_ARM_10D, + camera_fov_deg=69.0, + camera_aspect=320 / 256, + to_opencv=_GOOGLE_ROBOT_TO_OPENCV, + 
viewer_overrides=raw_action_override, + ), + "bridge": _lazycfg_to_entry( + DATASET_BRIDGE_480, + robot_name="widowx", + max_finger_width=0.06, + fps=5, + action_format=ActionFormat.SINGLE_ARM_10D, + to_opencv=_BRIDGE_TO_OPENCV, + viewer_overrides=raw_action_override, + ), + "droid": _lazycfg_to_entry( + DATASET_DROID_480, + robot_name="franka_panda", + max_finger_width=0.08, + fps=15, + action_format=ActionFormat.SINGLE_ARM_10D, + to_opencv=_DROID_TO_OPENCV, + viewer_overrides=raw_action_override, + ), + "umi": _lazycfg_to_entry( + DATASET_UMI_256, + robot_name="", + max_finger_width=0.0, + fps=20, + action_format=ActionFormat.SINGLE_ARM_10D, + viewer_overrides=raw_action_override, + ), + "robomind_franka": _lazycfg_to_entry( + DATASET_ROBOMIND_FRANKA_480, + robot_name="franka_panda", + max_finger_width=0.08, + fps=10, + action_format=ActionFormat.SINGLE_ARM_10D, + to_opencv=_FRANKA_TO_OPENCV, + viewer_overrides=raw_action_override, + ), + "robomind_franka_dual": _lazycfg_to_entry( + DATASET_ROBOMIND_FRANKA_DUAL_480, + robot_name="franka_panda", + max_finger_width=0.08, + fps=10, + action_format=ActionFormat.DUAL_ARM_20D, + dual_base_left=np.array( + [[1, 0, 0, 0.0], [0, 1, 0, 0.3], [0, 0, 1, 0.0], [0, 0, 0, 1.0]], + dtype=np.float32, + ), + dual_base_right=np.array( + [[1, 0, 0, 0.0], [0, 1, 0, -0.3], [0, 0, 1, 0.0], [0, 0, 0, 1.0]], + dtype=np.float32, + ), + to_opencv=_FRANKA_TO_OPENCV, + viewer_overrides=raw_action_override, + ), + } + + +DATASETS: dict[str, DatasetEntry] = {} + + +# ── Dataset Creation ────────────────────────────────────────────────────────── + + +def _create_dataset(entry: DatasetEntry, chunk_length: int): + """Instantiate a dataset class for the given entry.""" + import importlib + import inspect + + module_path, class_name = entry.dataset_class.rsplit(".", 1) + mod = importlib.import_module(module_path) + cls = getattr(mod, class_name) + + kwargs = dict(entry.dataset_kwargs) + kwargs["chunk_length"] = chunk_length + kwargs["split"] = 
"full" + kwargs["mode"] = "policy" + kwargs["enable_fast_init"] = True + + # UMI: factory function + if callable(cls) and not inspect.isclass(cls): + _OMEGACONF_BLOCKLIST = {"chunk_length", "split", "action_normalization", "enable_fast_init"} + kwargs = {k: v for k, v in kwargs.items() if k not in _OMEGACONF_BLOCKLIST} + kwargs["eager_load"] = True + return cls(**kwargs) + + sig = inspect.signature(cls.__init__) + valid_params = set(sig.parameters.keys()) - {"self"} + has_var_keyword = any(p.kind == inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values()) + if not has_var_keyword: + kwargs = {k: v for k, v in kwargs.items() if k in valid_params} + + dataset = cls(**kwargs) + + if hasattr(dataset, "_register_sources"): + dataset._register_sources() + if hasattr(dataset, "__len__") and len(dataset) == 0: + raise RuntimeError(f"{entry.name}: registered sources but found no valid samples") + + from torch.utils.data import IterableDataset as _IterableBase + + if isinstance(dataset, _IterableBase): + dataset = _IterableToMapDataset(dataset) + + return dataset + + +def _resolve_action_format( + entry: DatasetEntry, + action_format_override: ActionFormat | None, +) -> ActionFormat: + """Return the explicit raw action format for one dataset load.""" + return action_format_override if action_format_override is not None else entry.action_format + + +@lru_cache(maxsize=None) +def _load_symbol(target: str): + """Import and cache one ``module:symbol`` reference.""" + + module_name, symbol_name = target.split(":", maxsplit=1) + module = importlib.import_module(module_name) + return getattr(module, symbol_name) + + +def _format_sample_text(value: Any, max_chars: int | None = None) -> str: + """Format optional sample text for the viewer info panel.""" + + if value is None: + return "" + text = value if isinstance(value, str) else str(value) + if not text: + return "" + if max_chars is None: + return text + return text[:max_chars] + + +def 
_build_viewer_idle_action_spec(action_format: ActionFormat) -> Any: + """Build a fallback idle-frame spec from the viewer-declared action format.""" + + from cosmos_framework.data.vfm.action.action_spec import Gripper, Pos, Rot, build_action_spec + + if action_format is ActionFormat.EGO_9D: + return build_action_spec(Pos(prefix="ego"), Rot("rot6d", prefix="ego")) + if action_format is ActionFormat.SINGLE_ARM_10D: + return build_action_spec(Pos(), Rot("rot6d"), Gripper()) + if action_format is ActionFormat.DUAL_ARM_20D: + return build_action_spec( + Pos(prefix="left"), + Rot("rot6d", prefix="left"), + Gripper(prefix="left"), + Pos(prefix="right"), + Rot("rot6d", prefix="right"), + Gripper(prefix="right"), + ) + if action_format is ActionFormat.UNIFIED_57D: + return build_action_spec( + Pos(prefix="ego"), + Rot("rot6d", prefix="ego"), + Pos(prefix="right_wrist"), + Rot("rot6d", prefix="right_wrist"), + Pos(dim=15, prefix="right_fingers"), + Pos(prefix="left_wrist"), + Rot("rot6d", prefix="left_wrist"), + Pos(dim=15, prefix="left_fingers"), + ) + raise ValueError(f"Unsupported action format for idle-frame detection: {action_format}") + + +def _compute_viewer_idle_frames( + action: Any, + dataset: Any, + action_format: ActionFormat, +) -> torch.Tensor | None: + """Compute idle frames for a viewer sample when the dataset did not provide them.""" + + action_spec = getattr(dataset, "action_spec", None) + compute_idle_frames_method = getattr(dataset, "_compute_idle_frames", None) + if action_spec is not None and compute_idle_frames_method is not None: + return compute_idle_frames_method(action) + + from cosmos_framework.data.vfm.action.pose_utils import compute_idle_frames + + spec = _build_viewer_idle_action_spec(action_format) + try: + idle_frames = compute_idle_frames(action, spec) + except (TypeError, ValueError) as error: + log.warning(f"Viewer idle-frame detection skipped for {action_format.value}: {error}") + return None + return torch.tensor(idle_frames, 
def _enable_viewer_idle_frames(sample: dict[str, Any], dataset: Any, action_format: ActionFormat) -> dict[str, Any]:
    """Attach idle-frame metadata to a sample and run the idle-frame caption augmentor.

    If the sample has no ``"idle_frames"`` but does have an ``"action"``, the
    idle frames are computed with ``_compute_viewer_idle_frames``. When no idle
    frames are available at all, the input ``sample`` object is returned as is.

    Args:
        sample: Sample dictionary from the dataset; it is never mutated.
        dataset: Dataset the sample came from, forwarded to the idle-frame computation.
        action_format: Action layout, forwarded to the idle-frame computation.

    Returns:
        The original ``sample`` when there are no idle frames; otherwise the
        augmentor's result, or the prepared copy if the augmentor returns ``None``.
    """

    idle_frames, action = sample.get("idle_frames"), sample.get("action")
    if idle_frames is None and action is not None:
        idle_frames = _compute_viewer_idle_frames(action, dataset, action_format)

    if idle_frames is None:
        return sample

    # Work on a shallow copy so the caller's sample stays untouched.
    enriched = dict(sample)
    enriched["idle_frames"] = idle_frames

    # A dict caption is copied too, so the augmentor cannot edit the original.
    caption = enriched.get("ai_caption")
    if isinstance(caption, dict):
        enriched["ai_caption"] = dict(caption)

    augmented = _get_viewer_idle_frames_augmentor()(enriched)
    if augmented is None:
        return enriched
    return augmented
len(self._samples) <= idx and not self._exhausted and len(self._samples) < self._max: + try: + self._samples.append(next(self._iter)) + log.info(f"{self._ds_name}: fetched sample {len(self._samples) - 1}") + except StopIteration: + self._exhausted = True + log.info(f"{self._ds_name}: exhausted at {len(self._samples)}") + return idx < len(self._samples) + + def __len__(self): + return max(len(self._samples), 1) + + def __getitem__(self, idx): + if self._fetch_up_to(idx): + return self._samples[idx] + if self._samples: + return self._samples[idx % len(self._samples)] + raise IndexError(f"{self._ds_name}: no samples available") + + +# ── Viewer ──────────────────────────────────────────────────────────────────── + + +def _collect_scene_points(state) -> np.ndarray: + """Collect all visible trajectory positions for camera fitting.""" + points: list[np.ndarray] = [] + for poses in (state.ego_poses, state.right_poses, state.left_poses): + if poses is not None and len(poses) > 0: + points.append(poses[:, :3, 3].astype(np.float32)) + if not points: + return np.zeros((1, 3), dtype=np.float32) + return np.concatenate(points, axis=0) + + +def _get_observation_up_direction(state, view_forward: np.ndarray) -> np.ndarray: + """Estimate a stable viewer up-direction from the observation camera poses.""" + if state.ego_poses is None or len(state.ego_poses) == 0: + return np.array([0.0, 0.0, 1.0], dtype=np.float32) + + # Ego poses are camera-to-world transforms in OpenCV camera convention, + # where image up corresponds to the negative camera Y axis. 
+ camera_up = -state.ego_poses[:, :3, 1].astype(np.float32) + reference = camera_up[0] + aligned_up = camera_up.copy() + for idx in range(1, len(aligned_up)): + if float(np.dot(aligned_up[idx], reference)) < 0.0: + aligned_up[idx] *= -1.0 + + up_direction = aligned_up.mean(axis=0) + up_direction -= view_forward * float(np.dot(up_direction, view_forward)) + up_norm = float(np.linalg.norm(up_direction)) + if up_norm < 1e-6: + return np.array([0.0, 0.0, 1.0], dtype=np.float32) + return up_direction / up_norm + + +def _get_observation_forward_direction(state) -> np.ndarray | None: + """Estimate a stable viewer forward direction from the observation camera poses.""" + if state.ego_poses is None or len(state.ego_poses) == 0: + return None + + # Ego poses are camera-to-world transforms in OpenCV camera convention, + # where the optical axis points along the positive camera Z axis. + camera_forward = state.ego_poses[:, :3, 2].astype(np.float32) + reference = camera_forward[0] + aligned_forward = camera_forward.copy() + for idx in range(1, len(aligned_forward)): + if float(np.dot(aligned_forward[idx], reference)) < 0.0: + aligned_forward[idx] *= -1.0 + + forward_direction = aligned_forward.mean(axis=0) + forward_norm = float(np.linalg.norm(forward_direction)) + if forward_norm < 1e-6: + return None + return forward_direction / forward_norm + + +def _reset_camera_to_trajectory(client, state, camera_fov_deg: float) -> None: + """Frame one client's viewport using the current trajectory extent.""" + points = _collect_scene_points(state) + center = points.mean(axis=0) + extent = points - center[None, :] + radius = float(np.linalg.norm(extent, axis=1).max()) if len(points) > 0 else 0.0 + radius = max(radius, 0.15) + + fov_rad = float(np.deg2rad(camera_fov_deg)) + fit_distance = radius / max(np.tan(fov_rad / 2.0), 0.35) + view_forward = _get_observation_forward_direction(state) + if view_forward is None: + view_dir = np.array([1.0, -1.0, 0.7], dtype=np.float32) + view_dir /= 
np.linalg.norm(view_dir) + camera_position = center + view_dir * max(fit_distance * 1.35, 0.5) + view_forward = center - camera_position + view_forward /= np.linalg.norm(view_forward) + else: + camera_position = center - view_forward * max(fit_distance * 1.35, 0.5) + view_forward = center - camera_position + view_forward /= np.linalg.norm(view_forward) + up_direction = _get_observation_up_direction(state, view_forward) + camera = client.camera + + # Camera state arrives asynchronously from the browser. Wait briefly so we can + # update only this client's viewport instead of broadcasting a global reset target. + deadline = _time.time() + 1.0 + while getattr(camera._state, "update_timestamp", 0.0) == 0.0 and _time.time() < deadline: + _time.sleep(0.01) + if getattr(camera._state, "update_timestamp", 0.0) == 0.0: + return + + camera.fov = fov_rad + camera.up_direction = tuple(up_direction.tolist()) + camera.look_at = tuple(center.tolist()) + # Setting position also translates look_at, so restore the target afterwards. 
+ camera.position = tuple(camera_position.tolist()) + camera.look_at = tuple(center.tolist()) + client.flush() + + +def launch_viewer( + port: int = 8013, + share: bool = False, + chunk_length: int = 16, + action_format_override: ActionFormat | None = None, +) -> None: + """Launch the interactive dataset viewer.""" + global DATASETS + import threading as _threading + + import viser + + from cosmos_framework.data.vfm.action.urdf_visualizer.unified_action import ( + build_scene_state, + get_video_from_sample, + to_unified, + ) + from cosmos_framework.data.vfm.action.urdf_visualizer.unified_renderer import UnifiedRenderer + + server = viser.ViserServer(host="0.0.0.0", port=port) + if not DATASETS: + DATASETS = _build_datasets() + datasets = DATASETS + dataset_cache: dict[str, Any] = {} + dataset_locks: dict[str, Any] = {} + dataset_cache_lock = _threading.Lock() + sessions_lock = _threading.Lock() + + def _get_dataset_lock(cache_key: str) -> Any: + """Return the per-dataset lock for a cache key.""" + with dataset_cache_lock: + lock = dataset_locks.get(cache_key) + if lock is None: + lock = _threading.Lock() + dataset_locks[cache_key] = lock + return lock + + @dataclass + class ViewerSession: + client: Any + renderer: Any + time_slider: Any + speed_slider: Any + play_button: Any + action_text: Any + show: dict[str, bool | float] + load_lock: Any = field(default_factory=_threading.Lock) + playing: bool = False + last_frame_time: float = 0.0 + + sessions: dict[int, ViewerSession] = {} + + @server.on_client_connect + def _(client) -> None: + client.scene.reset() + client.scene.set_up_direction("+z") + client.gui.reset() + + renderer = UnifiedRenderer(client) + + with client.gui.add_folder("Dataset"): + ds_dropdown = client.gui.add_dropdown( + "Dataset", options=list(datasets.keys()), initial_value=list(datasets.keys())[0] + ) + ep_input = client.gui.add_number("Episode", initial_value=0, min=0, step=1) + random_button = client.gui.add_button("🎲 Random episode") + 
status_text = client.gui.add_markdown("*Ready*") + info_text = client.gui.add_markdown("") + + with client.gui.add_folder("Display", expand_by_default=False): + show_robot = client.gui.add_checkbox("Show robot mesh", initial_value=True) + show_frames = client.gui.add_checkbox("Show wrist frames", initial_value=True) + show_traj = client.gui.add_checkbox("Show trajectory", initial_value=True) + show_fingertips = client.gui.add_checkbox("Show fingertips", initial_value=True) + show_ego = client.gui.add_checkbox("Show ego camera", initial_value=True) + axis_scale = client.gui.add_slider("Axis scale", min=0.1, max=20.0, step=0.1, initial_value=1.0) + + robot_frame_toggle_handles: dict[str, Any] = {} + robot_frame_toggle_folder = client.gui.add_folder("Robot Frame Toggles", expand_by_default=False) + with robot_frame_toggle_folder: + robot_frame_toggle_status = client.gui.add_markdown("*Load an episode to choose robot frame coordinates.*") + + with client.gui.add_folder("Playback"): + time_slider = client.gui.add_slider("Time", min=0, max=1, step=1, initial_value=0) + play_button = client.gui.add_button("▶ Play") + speed_slider = client.gui.add_slider("Speed (fps)", min=1, max=30, step=1, initial_value=3) + + cam_panel = client.gui.add_image(np.zeros((64, 64, 3), dtype=np.uint8)) + renderer.set_video_panel(cam_panel) + + with client.gui.add_folder("Action (57D)"): + action_text = client.gui.add_markdown("*No episode loaded*") + + show = { + "frames": True, + "traj": True, + "fingertips": True, + "ego": True, + "robot": True, + "robot_frame_filters": {}, + "axis_scale": 1.0, + } + session = ViewerSession( + client=client, + renderer=renderer, + time_slider=time_slider, + speed_slider=speed_slider, + play_button=play_button, + action_text=action_text, + show=show, + ) + + def _update_action_text(t: int) -> None: + """Update the 57D action display for one client.""" + txt = renderer.format_action_text(t) + action_text.content = txt if txt else "*No data*" + + def 
_clear_robot_frame_toggles() -> None: + """Remove the dynamic per-frame toggle controls.""" + for handle in robot_frame_toggle_handles.values(): + handle.remove() + robot_frame_toggle_handles.clear() + + def _rebuild_robot_frame_toggles() -> None: + """Rebuild the GUI toggles for the currently loaded robot frames.""" + _clear_robot_frame_toggles() + selectors = renderer.get_robot_frame_selectors() + if not selectors: + show["robot_frame_filters"] = {} + robot_frame_toggle_status.content = "*No robot frame coordinates available for this episode.*" + return + + prev_filters = cast(dict[str, bool], show.get("robot_frame_filters", {})) + show["robot_frame_filters"] = { + selector_key: prev_filters.get(selector_key, False) for selector_key, _ in selectors + } + robot_frame_toggle_status.content = "*Choose which robot frame coordinates to show.*" + + with robot_frame_toggle_folder: + for selector_key, label in selectors: + checkbox = client.gui.add_checkbox( + label, + initial_value=cast(dict[str, bool], show["robot_frame_filters"])[selector_key], + ) + robot_frame_toggle_handles[selector_key] = checkbox + + @checkbox.on_update + def _(_, selector_key: str = selector_key, checkbox: Any = checkbox) -> None: + cast(dict[str, bool], show["robot_frame_filters"])[selector_key] = bool(checkbox.value) + renderer.update(time_slider.value, show) + + def do_load_episode() -> None: + t_start = _time.time() + ds_name = ds_dropdown.value + entry = datasets[ds_name] + effective_action_format = _resolve_action_format(entry, action_format_override) + ep_idx = max(int(ep_input.value), 0) + cache_key = ds_name + + status_text.content = f"⏳ Loading {ds_name} episode {ep_idx}..." + + try: + with _get_dataset_lock(cache_key): + dataset: Any + with dataset_cache_lock: + dataset = cast(Any, dataset_cache.get(cache_key)) + if dataset is None: + status_text.content = f"⏳ Creating {ds_name} dataset..." 
+ dataset = _create_dataset(entry, chunk_length) + with dataset_cache_lock: + dataset_cache[cache_key] = dataset + to_opencv = entry.to_opencv if entry.to_opencv is not None else np.eye(3, dtype=np.float32) + + n_total = len(dataset) + if ep_idx >= n_total: + if isinstance(dataset, _IterableToMapDataset) and not dataset._exhausted: + pass + else: + ep_idx = n_total - 1 + ep_input.value = ep_idx + + sample: Any = _enable_viewer_idle_frames(dataset[ep_idx], dataset, effective_action_format) + + action_tensor = sample["action"] + action_raw = ( + action_tensor.numpy() if isinstance(action_tensor, torch.Tensor) else np.asarray(action_tensor) + ) + + uses_dual_initial_pose = effective_action_format is ActionFormat.DUAL_ARM_20D + + initial_pose_t = sample.get("initial_pose") + if initial_pose_t is None: + initial_pose = np.eye(4, dtype=np.float32) + elif isinstance(initial_pose_t, torch.Tensor): + initial_pose = initial_pose_t.numpy().astype(np.float32) + else: + initial_pose = np.asarray(initial_pose_t, dtype=np.float32) + + initial_pose_right_t = sample.get("initial_pose_right") + initial_pose_left_t = sample.get("initial_pose_left") + initial_pose_right = None + initial_pose_left = None + if initial_pose_right_t is not None: + initial_pose_right = ( + initial_pose_right_t.numpy().astype(np.float32) + if isinstance(initial_pose_right_t, torch.Tensor) + else np.asarray(initial_pose_right_t, dtype=np.float32) + ) + if initial_pose_left_t is not None: + initial_pose_left = ( + initial_pose_left_t.numpy().astype(np.float32) + if isinstance(initial_pose_left_t, torch.Tensor) + else np.asarray(initial_pose_left_t, dtype=np.float32) + ) + if uses_dual_initial_pose: + if initial_pose_left is None: + initial_pose_left = initial_pose + + if entry.to_unified_fn: + converter = _load_symbol(entry.to_unified_fn) + import inspect as _inspect + + params = _inspect.signature(converter).parameters + embodiment_type = entry.robot_embodiment_type or str( + 
entry.dataset_kwargs.get("embodiment_type", "") + ) + if "embodiment_type" in params: + unified = converter(sample, embodiment_type=embodiment_type) + elif "kind" in params: + unified = converter(action_raw, kind="gripper") + else: + unified = converter(action_raw) + raw_action_label = "custom" + else: + unified = to_unified(action_raw, action_format=effective_action_format) + raw_action_label = effective_action_format.value + state = build_scene_state( + unified, + initial_pose=initial_pose, + initial_pose_right=initial_pose_right, + initial_pose_left=initial_pose_left, + right_base_pose=entry.dual_base_right, + left_base_pose=entry.dual_base_left, + pose_convention=entry.pose_convention, + sample=sample, + ) + state.video = get_video_from_sample(sample) + + # Inject FK joint configs when the dataset provides them (e.g. UR). + jc = sample.get("joint_configs") + if jc is not None: + state.joint_configs = ( + jc.numpy().astype(np.float32) + if isinstance(jc, torch.Tensor) + else np.asarray(jc, dtype=np.float32) + ) + status_text.content = "⏳ Loading robot animation..." 
+ renderer.load(state, entry, to_opencv=to_opencv) + _rebuild_robot_frame_toggles() + _reset_camera_to_trajectory(client, state, entry.camera_fov_deg) + + T = state.T + time_slider.max = max(T, 1) + time_slider.value = 0 + + ai_caption_text = _format_sample_text(sample.get("ai_caption", ""), max_chars=160) + debug_caption_text = _format_sample_text(sample.get("debug_caption", "")) + t_total = _time.time() - t_start + info_text.content = ( + f"**{ds_name.upper()}** — Episode {ep_idx}\n\n" + + (f"Task: {ai_caption_text}\n\n" if ai_caption_text else "") + + (f"Debug: {debug_caption_text}\n\n" if debug_caption_text else "") + + ( + f"Steps: {T} | Raw: {raw_action_label} ({action_raw.shape[-1]}D) → 57D | " + f"Robot: {entry.robot_name or '—'} | FPS: {entry.fps}" + ) + ) + status_text.content = f"✅ Loaded in {t_total:.1f}s" + log.info(f"Loaded {ds_name} ep {ep_idx}: {ai_caption_text[:60]}, {T} steps, {t_total:.1f}s") + + renderer.update(0, show) + renderer.update_axis_scale(axis_scale.value) + _update_action_text(0) + session.last_frame_time = _time.time() + + except Exception as e: + status_text.content = f"❌ Load failed: {e}" + log.error(f"Load failed for client {client.client_id}: {e}") + import traceback + + traceback.print_exc() + + def _do_load_threaded() -> None: + if not session.load_lock.acquire(blocking=False): + return + + def _run() -> None: + try: + do_load_episode() + finally: + session.load_lock.release() + + _threading.Thread(target=_run, daemon=True).start() + + @ds_dropdown.on_update + def _(_) -> None: + _do_load_threaded() + + @ep_input.on_update + def _(_) -> None: + _do_load_threaded() + + @random_button.on_click + def _(_) -> None: + ds_name = ds_dropdown.value + cache_key = ds_name + with _get_dataset_lock(cache_key): + with dataset_cache_lock: + ds = dataset_cache.get(cache_key) + if ds is None: + ep_input.value = 0 + elif isinstance(ds, _IterableToMapDataset): + ep_input.value = len(ds._samples) + else: + ep_input.value = random.randint(0, 
max(len(ds) - 1, 0)) + _do_load_threaded() + + @time_slider.on_update + def _(_) -> None: + renderer.update(time_slider.value, show) + _update_action_text(time_slider.value) + + @show_robot.on_update + def _(_) -> None: + show["robot"] = show_robot.value + renderer.update(time_slider.value, show) + + @show_frames.on_update + def _(_) -> None: + show["frames"] = show_frames.value + renderer.update(time_slider.value, show) + + @show_traj.on_update + def _(_) -> None: + show["traj"] = show_traj.value + renderer.update(time_slider.value, show) + + @show_fingertips.on_update + def _(_) -> None: + show["fingertips"] = show_fingertips.value + renderer.update(time_slider.value, show) + + @show_ego.on_update + def _(_) -> None: + show["ego"] = show_ego.value + renderer.update(time_slider.value, show) + + @axis_scale.on_update + def _(_) -> None: + show["axis_scale"] = axis_scale.value + renderer.update_axis_scale(axis_scale.value) + + @play_button.on_click + def _(_) -> None: + session.playing = not session.playing + session.last_frame_time = _time.time() + play_button.label = "⏸ Pause" if session.playing else "▶ Play" + + with sessions_lock: + sessions[client.client_id] = session + _do_load_threaded() + + @server.on_client_disconnect + def _(client) -> None: + with sessions_lock: + sessions.pop(client.client_id, None) + + # ── Share URL ── + log.info(f"✅ Viewer ready at http://0.0.0.0:{port}") + if share: + share_url = server.request_share_url() + if share_url: + log.info(f"🌐 Share URL: {share_url}") + try: + with open(os.path.expanduser("~/share_url.txt"), "w") as f: + f.write(share_url + "\n") + except Exception: + pass + + # ── Main Loop ── + try: + while True: + now = _time.time() + with sessions_lock: + active_sessions = list(sessions.values()) + for session in active_sessions: + renderer = session.renderer + if not session.playing or renderer.state is None: + continue + frame_period = 1.0 / max(float(session.speed_slider.value), 1.0) + if now - 
def main():
    """Parse the command-line options and start the viewer.

    Options: ``--port`` (default 8013), ``--share``, ``--chunk-length``
    (default 16) and ``--action-format``, which must be one of the
    ``ActionFormat`` values and is converted back to the enum member.
    """
    cli = argparse.ArgumentParser(description="Action dataset viewer (unified 57D)")
    cli.add_argument("--port", type=int, default=8013)
    cli.add_argument("--share", action="store_true")
    cli.add_argument("--chunk-length", type=int, default=16)
    format_names = [fmt.value for fmt in ActionFormat]
    cli.add_argument(
        "--action-format",
        choices=format_names,
        default=None,
        help="Optional override for the dataset-declared raw action format",
    )
    opts = cli.parse_args()

    # ``None`` means "use the format each dataset entry declares".
    override = None
    if opts.action_format is not None:
        override = ActionFormat(opts.action_format)

    launch_viewer(
        port=opts.port,
        share=opts.share,
        chunk_length=opts.chunk_length,
        action_format_override=override,
    )
class ViewpointTextInfo(Augmentor):
    """Augmentor that appends a viewpoint description sentence to captions.

    Reads a viewpoint label from ``data_dict[viewpoint_key]`` and appends
    the corresponding template sentence to the caption. Designed to run
    after the raw ``ai_caption`` is set but before duration/FPS metadata
    is appended.

    Args:
        input_keys: Input keys (kept for API compatibility).
        output_keys: Output keys (kept for API compatibility).
        args: Configuration arguments:
            - caption_key (str): Key for caption in data_dict. Default: ``"ai_caption"``
            - viewpoint_key (str): Key for viewpoint label. Default: ``"viewpoint"``
            - templates (dict): Override mapping from viewpoint to sentence.
              Default: :data:`DEFAULT_VIEWPOINT_TEMPLATES`
            - separator (str): Separator used when the preceding text does not
              already end with a period. Default: ``". "``
            - enabled (bool): Whether augmentation is enabled. Default: ``True``
    """

    def __init__(
        self,
        input_keys: list | None = None,
        output_keys: list | None = None,
        args: dict | None = None,
    ) -> None:
        super().__init__(input_keys or [], output_keys or [], args)

        # A missing or empty ``args`` means "use every default".
        options = args if args else {}
        self.caption_key: str = options.get("caption_key", "ai_caption")
        self.viewpoint_key: str = options.get("viewpoint_key", "viewpoint")
        self.templates: dict[str, str] = options.get("templates", DEFAULT_VIEWPOINT_TEMPLATES)
        self.default_separator: str = options.get("separator", ". ")
        self.enabled: bool = options.get("enabled", True)

    def _join(self, head: str, tail: str) -> str:
        """Concatenate two text pieces, separated by a space if ``head`` ends with a period."""
        separator = " " if head.endswith(".") else self.default_separator
        return head + separator + tail

    def __call__(self, data_dict: dict) -> dict | None:
        """Append the viewpoint description to the caption.

        If the sample provides a non-empty ``"additional_view_description"``
        string, it is appended after the selected template sentence. The key
        is always removed from *data_dict* once the viewpoint label has been
        read, even if the caption ends up unchanged.

        Args:
            data_dict: Sample dictionary containing caption and viewpoint.

        Returns:
            The mutated *data_dict*. The caption is left unchanged when the
            augmentor is disabled, the viewpoint is not in the template table
            (a warning is logged), or the caption is not a string or is
            empty / whitespace-only.

        Raises:
            ValueError: If ``data_dict`` has no (or a ``None``) viewpoint label.
        """
        if not self.enabled:
            return data_dict

        viewpoint = data_dict.get(self.viewpoint_key)
        if viewpoint is None:
            raise ValueError(
                f"ViewpointTextInfo: missing key {self.viewpoint_key!r} in data_dict. "
                f"All action datasets must provide a viewpoint label."
            )

        # Consume the dataset-specific layout description up front so it does
        # not linger in the sample regardless of which path is taken below.
        additional_view_description = data_dict.pop("additional_view_description", None)
        template = self.templates.get(viewpoint)

        if template is None:
            log.warning(
                f"ViewpointTextInfo: unrecognized viewpoint {viewpoint!r}. "
                f"Known viewpoints: {sorted(self.templates.keys())}. Skipping.",
                rank0_only=False,
            )
            return data_dict

        if additional_view_description:
            template = self._join(template, additional_view_description.rstrip())

        caption = data_dict.get(self.caption_key)
        if not isinstance(caption, str):
            return data_dict

        # Check emptiness after stripping: a whitespace-only caption would
        # otherwise turn into a caption that starts with the separator.
        caption = caption.rstrip()
        if caption == "":
            return data_dict

        data_dict[self.caption_key] = self._join(caption, template)
        return data_dict
def fmt_mb(mb: float) -> str:
    """Format a MiB value as a human-readable string (MiB or GiB).

    The unit is chosen from the magnitude of ``mb``, so negative values
    (for example an RSS delta after memory was released) switch to GiB at
    the same threshold as positive ones.

    Args:
        mb: Amount in MiB; may be negative.

    Returns:
        ``"<x.xx> GiB"`` when ``abs(mb) >= 1024``, otherwise ``"<x.x> MiB"``.
    """
    if abs(mb) >= 1024:
        return f"{mb / 1024:.2f} GiB"
    return f"{mb:.1f} MiB"
def get_worker_memory_detailed() -> list[dict[str, float]]:
    """Return RSS, USS (Unique Set Size), and PSS for each child process.

    USS is the memory *unique* to a process -- not shared with any other.
    PSS counts shared pages proportionally (shared_page / num_sharers).

    Returns:
        One dict per child process (found recursively) with keys ``pid``,
        ``rss``, ``uss`` and ``pss``.  Memory values are in MiB and ``pid``
        is stored as a float.  A value of ``-1.0`` marks a metric that could
        not be obtained:

        * ``pss`` is ``-1.0`` when ``memory_full_info()`` succeeds but does
          not expose a ``pss`` field (psutil documents PSS as Linux-only);
          RSS and USS are still reported in that case.
        * ``uss`` and ``pss`` are both ``-1.0`` when ``memory_full_info()``
          is unavailable and only ``memory_info()`` could be read.

        Children that disappear or cannot be inspected at all are skipped.
    """
    mib = 1024 * 1024
    proc = psutil.Process(os.getpid())
    result: list[dict[str, float]] = []
    for child in proc.children(recursive=True):
        try:
            full = child.memory_full_info()
            # Look PSS up defensively so a platform without it does not
            # throw away the USS figure that *is* available.
            pss_bytes = getattr(full, "pss", None)
            entry = {
                "pid": float(child.pid),
                "rss": full.rss / mib,
                "uss": full.uss / mib,
                "pss": pss_bytes / mib if pss_bytes is not None else -1.0,
            }
        except (psutil.NoSuchProcess, psutil.AccessDenied, AttributeError):
            # Detailed accounting failed; fall back to plain RSS.
            try:
                entry = {
                    "pid": float(child.pid),
                    "rss": child.memory_info().rss / mib,
                    "uss": -1.0,
                    "pss": -1.0,
                }
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                # The child exited or is not inspectable: best-effort skip.
                continue
        result.append(entry)
    return result
+ """ + import gc + import logging + + pid = os.getpid() + rss = get_rss_mb() + uss = get_uss_mb() + logger = logging.getLogger(f"memprofile.worker.{pid}") + + logger.warning(f"[WORKER {pid}] RSS={fmt_mb(rss)} USS={fmt_mb(uss)}") + + # --- LeRobotDataset instances --- + datasets_list = getattr(dataset, "_datasets", []) + loaded_count = sum(1 for ds in datasets_list if ds is not None) + total_count = len(datasets_list) + logger.warning(f"[WORKER {pid}] LeRobotDataset: {loaded_count}/{total_count} loaded") + + total_arrow_bytes = 0 + total_hf_rows = 0 + for i, ds in enumerate(datasets_list): + if ds is None: + continue + hf_ds = getattr(ds, "hf_dataset", None) + if hf_ds is None: + logger.warning(f"[WORKER {pid}] ds[{i}]: hf_dataset not yet loaded") + continue + + num_rows = len(hf_ds) + total_hf_rows += num_rows + + arrow_bytes = 0 + data_table = getattr(hf_ds, "_data", None) + if data_table is not None and hasattr(data_table, "nbytes"): + arrow_bytes = data_table.nbytes + total_arrow_bytes += arrow_bytes + + logger.warning(f"[WORKER {pid}] ds[{i}]: rows={num_rows}, arrow={fmt_mb(arrow_bytes / (1024 * 1024))}") + + if loaded_count > 0: + logger.warning( + f"[WORKER {pid}] Total HF rows={total_hf_rows}, total arrow={fmt_mb(total_arrow_bytes / (1024 * 1024))}" + ) + + # --- VideoDecoderCache --- + try: + from lerobot.datasets.video_utils import _default_decoder_cache + + cache_size = _default_decoder_cache.size() + logger.warning(f"[WORKER {pid}] VideoDecoderCache entries: {cache_size}") + except Exception: + pass + + # --- GC stats --- + gc_counts = gc.get_count() + all_objects = len(gc.get_objects()) + logger.warning(f"[WORKER {pid}] GC counts={gc_counts}, tracked objects={all_objects}") + + +def deep_size(obj: object, seen: set | None = None) -> int: + """Approximate deep memory size in bytes for nested Python containers. + + Recursively walks ``dict``, ``list``, ``tuple``, ``set``, and ``frozenset``. + Does **not** follow arbitrary object attributes. 
+ """ + if seen is None: + seen = set() + obj_id = id(obj) + if obj_id in seen: + return 0 + seen.add(obj_id) + size = sys.getsizeof(obj) + if isinstance(obj, dict): + for k, v in obj.items(): + size += deep_size(k, seen) + deep_size(v, seen) + elif isinstance(obj, (list, tuple, set, frozenset)): + for item in obj: + size += deep_size(item, seen) + return size diff --git a/cosmos-framework/cosmos_framework/data/vfm/augmentors/__init__.py b/cosmos-framework/cosmos_framework/data/vfm/augmentors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cosmos-framework/cosmos_framework/data/vfm/augmentors/idle_frames_text_info.py b/cosmos-framework/cosmos_framework/data/vfm/augmentors/idle_frames_text_info.py new file mode 100644 index 0000000000000000000000000000000000000000..6ec6a55aade91153c518a51ad10ef69e6fc4d29d --- /dev/null +++ b/cosmos-framework/cosmos_framework/data/vfm/augmentors/idle_frames_text_info.py @@ -0,0 +1,10 @@ +class IdleFramesTextInfo: + """Minimal standalone replacement for viewer caption augmentation.""" + + def __init__(self, input_keys=None, output_keys=None, args=None): + self.input_keys = input_keys or [] + self.output_keys = output_keys or [] + self.args = args or {} + + def __call__(self, sample, *args, **kwargs): + return sample diff --git a/cosmos-framework/cosmos_framework/utils/__init__.py b/cosmos-framework/cosmos_framework/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..503ec1b18d584ba1c349360dedbe6951e3216df6 --- /dev/null +++ b/cosmos-framework/cosmos_framework/utils/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: OpenMDW-1.1 + diff --git a/cosmos-framework/cosmos_framework/utils/easy_io/__init__.py b/cosmos-framework/cosmos_framework/utils/easy_io/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9f47d2e10ce222193f23f79e6cf1c70445dcb233 --- /dev/null +++ b/cosmos-framework/cosmos_framework/utils/easy_io/__init__.py @@ -0,0 +1 @@ +from .easy_io import easy_io diff --git a/cosmos-framework/cosmos_framework/utils/easy_io/easy_io.py b/cosmos-framework/cosmos_framework/utils/easy_io/easy_io.py new file mode 100644 index 0000000000000000000000000000000000000000..a768cb2d2c541856fef8b65a33a19aba2e7ab031 --- /dev/null +++ b/cosmos-framework/cosmos_framework/utils/easy_io/easy_io.py @@ -0,0 +1,40 @@ +# Minimal easy_io shim for the standalone action viewer. +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +class _EasyIO: + def __init__(self) -> None: + self._s3_backend_args: dict[str, Any] = {} + + def set_s3_backend(self, backend_args: dict[str, Any] | None = None) -> None: + self._s3_backend_args = dict(backend_args or {}) + + def _get_s3_client(self): + import boto3, yaml + cred_path = self._s3_backend_args.get("s3_credential_path") + kwargs = {} + if cred_path and Path(cred_path).exists(): + kwargs = yaml.safe_load(Path(cred_path).read_text()) or {} + return boto3.client("s3", **kwargs) + + def get_bytes(self, path: str) -> bytes: + if path.startswith("s3://") or path.startswith("gs://"): + no_scheme = path.split("://", 1)[1] + bucket, key = no_scheme.split("/", 1) + obj = self._get_s3_client().get_object(Bucket=bucket, Key=key) + return obj["Body"].read() + return Path(path).read_bytes() + + def get(self, path: str) -> str: + return self.get_bytes(path).decode() + + def load(self, path: str) -> Any: + text = self.get(path) + if path.endswith(".json"): + return json.loads(text) + return text + +easy_io = _EasyIO() diff --git 
def make_new_logger(depth: int = 1) -> Logger:
    """Create a loguru ``Logger`` backed by its own fresh ``Core``.

    Args:
        depth: Value passed through as the logger's ``depth`` option.

    Returns:
        A new ``Logger`` with empty ``patchers`` and ``extra`` and
        ``capture`` switched on; every other option is off or ``None``.
    """
    logger_options = {
        "exception": None,
        "depth": depth,
        "record": False,
        "lazy": False,
        "colors": False,
        "raw": False,
        "capture": True,
        "patchers": [],
        "extra": {},
    }
    return Logger(core=Core(), **logger_options)
+ record["extra"]["relative_path"] = f":{record['file'].path}" + + +*options, _, extra = logger._options # type: ignore +logger._options = tuple([*options, [_add_relative_path], extra]) # type: ignore + + +def init_loguru_stdout() -> None: + logger.remove() + datetime_format = get_datetime_format() + machine_format = get_machine_format() + message_format = get_message_format() + logger.add( + sys.stdout, + level=LEVEL, + format=f"{datetime_format}{machine_format}{message_format}", + filter=_rank0_only_filter, + ) + + +def init_loguru_file(path: str) -> None: + datetime_format = get_datetime_format() + machine_format = get_machine_format() + message_format = get_message_format() + logger.add( + path, + encoding="utf8", + level=LEVEL, + format=f"{datetime_format}{machine_format}{message_format}", + rotation="100 MB", + filter=lambda result: _rank0_only_filter(result) or not RANK0_ONLY, + enqueue=True, + ) + + +def get_datetime_format() -> str: + return "[{time:MM-DD HH:mm:ss}|" + + +def get_machine_format() -> str: + node_id = os.environ.get("NGC_ARRAY_INDEX", "0") + num_nodes = int(os.environ.get("NGC_ARRAY_SIZE", "1")) + machine_format = "" + rank = 0 + if dist.is_available(): + if not RANK0_ONLY and dist.is_initialized(): + rank = dist.get_rank() + world_size = dist.get_world_size() + machine_format = ( + f"[Node{node_id:<3}/{num_nodes:<3}][RANK{rank:<5}/{world_size:<5}]" + "[{process.name:<8}]| " + ) + return machine_format + + +def get_message_format() -> str: + message_format = "{level}|{extra[relative_path]}:{line}:{function}] {message}" + return message_format + + +def _rank0_only_filter(record: Any) -> bool: + is_rank0 = record["extra"].get("rank0_only", True) + if RANK == 0 and is_rank0: + return True + if not is_rank0: + record["message"] = f"[RANK {RANK}] " + record["message"] + return not is_rank0 + + +def trace(message: str, rank0_only: bool = True) -> None: + logger.opt(depth=1).bind(rank0_only=rank0_only).trace(message) + + +def debug(message: str, 
def info(message: str, rank0_only: bool = True) -> None:
    """Log *message* at INFO level through the module logger.

    Args:
        message: Text to log.
        rank0_only: Bound into the record's ``extra`` under the key
            ``rank0_only`` for the sink filter to read.
    """
    # depth=1 is passed to ``opt`` so the record is contextualised one frame
    # above this wrapper (see loguru's ``opt`` documentation).
    caller_logger = logger.opt(depth=1)
    bound_logger = caller_logger.bind(rank0_only=rank0_only)
    bound_logger.info(message)
+# SPDX-License-Identifier: OpenMDW-1.1 + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "cosmos-framework" +version = "1.2.2" +authors = [ + {name = "NVIDIA Corporation"}, +] +description = "Cosmos-Framework: Cosmos3 World Foundation Model (WFM)" +requires-python = ">=3.10" +license = {text = "OpenMDW-1.1"} +classifiers = [ + "Development Status :: 4 - Beta", + "Environment :: GPU :: NVIDIA CUDA", + "Intended Audience :: Science/Research", + "Operating System :: POSIX :: Linux", + "Programming Language :: Python", + "Topic :: Scientific/Engineering :: Artificial Intelligence", +] +dependencies = [ + "accelerate", + "av", + "cattrs", + "diffusers", + "einops", + "hydra-core", + "imageio-ffmpeg", + "imageio", + "loguru", + "msgpack", + "nvidia-cudnn-frontend", + "nvidia-ml-py", + "obstore", + "omegaconf", + "pydantic", + "requests", + "scipy", + "termcolor", + "transformers>=4.57.1,<5.0.0", + "tyro", + "uv", + "websockets", +] + +[project.optional-dependencies] +guardrail = [ + "better-profanity", + "nltk", + "protobuf", + "retinaface-py", + "sentencepiece", +] +interactive = [ +] +serve = [ + "fastapi", + "httpx", + "gradio", + "ray[serve]", +] +train = [ + "aioboto3", + "aiofiles", + "aiohttp", + "arrgh", + "blobfile", + "boto3", + "botocore", + "datasets", + "diffusers-cosmos3", + "dists-pytorch", + "einx", + "fastparquet", + "flask", + "flopth", + "ftfy", + "futureproof", + "fvcore", + "glfw", + "h5py", + "iopath>=0.1.10", + "imagecodecs", + "ipycanvas", + "ipyevents", + "jupyter-compare-view", + "jupyterlab", + "kornia", + "lerobot", + "lpips", + "lz4", + "matplotlib", + "mediapy", + "megatron-core", + "more-itertools", + "moviepy", + "multi-storage-client[boto3,google-cloud-storage,fsspec,observability-otel,vault]==0.44.0", + "ninja", + "nvidia-dali-cuda120", + "open-clip-torch", + "openai", + "opencv-contrib-python", + "packaging", + "pandas", + "parse", + "peft", + "pillow>=12.2.0", + "plyfile", + "polars", 
+ "polyscope", + "psycopg2-binary", + "py3nvml", + "pycocotools", + "pydispatcher", + "pygltflib", + "pyopengl", + "pytest", + "python-memcached", + "pytz", + "pyyaml", + "qwen-vl-utils", + "robotmq", + "rsa", + "s3fs", + "scikit-image", + "semver", + "setuptools", + "slangtorch", + "soundfile", + "tensorstore", + "tiktoken", + "timm", + "torch-fidelity", + "torch-optimizer", + "torchtitan", + "trimesh", + "typeguard", + "urllib3", + "wandb", + "webdataset", + "xatlas", + "zarr", +] + + +[dependency-groups] +dev = [ + "hatch", + "pyinstrument", + "pyrefly==0.55.0", + "pytest", + "pytest-custom_exit_code", + "pytest-cov", + "pytest-env", + "pytest-instafail", + "pytest-regressions", + "pytest-xdist", + "ruff==0.12.7", + "uv", +] +vllm = [ + "vllm==0.19.1", + "torch==2.10.0", +] +# Match nvcr.io/nvidia/pytorch:25.11-py3: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-26-01.html +cu128 = [ + "flash-attn-3-nv==1.0.3+cu128.torch210; platform_machine == 'x86_64'", + "flash-attn==2.7.4.post1+cu128.torch210; platform_machine == 'x86_64'", + "natten==0.21.6.dev6+cu128.torch210; platform_machine == 'x86_64'", + "torch==2.10.0+cu128", + "torchcodec==0.10.0+cu128", # https://github.com/meta-pytorch/torchcodec/releases + "torchvision==0.25.0+cu128", # https://github.com/pytorch/vision/releases + # Dependencies determined from 'uv pip install --dry-run "torch==2.10.0+cu128" --index-url https://download.pytorch.org/whl' + # Issue: https://github.com/astral-sh/uv/issues/14237 + "nvidia-cublas-cu12==12.8.4.1", + "nvidia-cuda-cupti-cu12==12.8.90", + "nvidia-cuda-nvrtc-cu12==12.8.93", + "nvidia-cuda-runtime-cu12==12.8.90", + "nvidia-cudnn-cu12==9.10.2.21", + "nvidia-cufft-cu12==11.3.3.83", + "nvidia-cufile-cu12==1.13.1.3", + "nvidia-curand-cu12==10.3.9.90", + "nvidia-cusolver-cu12==11.7.3.90", + "nvidia-cusparse-cu12==12.5.8.93", + "nvidia-cusparselt-cu12==0.7.1", + "nvidia-nccl-cu12==2.27.5", + "nvidia-nvjitlink-cu12==12.8.93", + 
"nvidia-nvshmem-cu12==3.4.5", + "nvidia-nvtx-cu12==12.8.90", +] +cu128-train = [ + {include-group = "cu128"}, + "torchao==0.16.0+cu128; platform_machine == 'x86_64'", # https://github.com/pytorch/ao/issues/2919 + "transformer-engine==2.12.0+cu128.torch210", + "triton==3.6.0", +] +# Match nvcr.io/nvidia/pytorch:25.11-py3: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-11.html +cu130 = [ + "flash-attn-3-nv==1.0.3+cu130.torch210; platform_machine == 'x86_64'", + "flash-attn==2.7.4.post1+cu130.torch210; platform_machine == 'x86_64'", + "natten==0.21.6.dev6+cu130.torch210", + "torch==2.10.0+cu130", + "torchcodec==0.10.0+cu130", # https://github.com/meta-pytorch/torchcodec/releases + "torchvision==0.25.0+cu130", # https://github.com/pytorch/vision/releases + # Dependencies determined from 'uv pip install --dry-run "torch==2.10.0+cu130" --index-url https://download.pytorch.org/whl' + # Issue: https://github.com/astral-sh/uv/issues/14237 + "nvidia-cublas==13.1.0.3", + "nvidia-cuda-cupti==13.0.85", + "nvidia-cuda-nvrtc==13.0.88", + "nvidia-cuda-runtime==13.0.96", + "nvidia-cudnn-cu13==9.15.1.9", + "nvidia-cufft==12.0.0.61", + "nvidia-cufile==1.15.1.6", + "nvidia-curand==10.4.0.35", + "nvidia-cusolver==12.0.4.66", + "nvidia-cusparse==12.6.3.3", + "nvidia-cusparselt-cu13==0.8.0", + "nvidia-nccl-cu13==2.28.9", + "nvidia-nvjitlink==13.0.88", + "nvidia-nvshmem-cu13==3.4.5", + "nvidia-nvtx==13.0.85", +] +cu130-train = [ + {include-group = "cu130"}, + "torchao==0.16.0+cu130; platform_machine == 'x86_64'", # https://github.com/pytorch/ao/issues/2919 + "transformer-engine==2.12.0+cu130.torch210", + "triton==3.6.0", +] +# LIBERO simulator dependencies for the closed-loop eval client. +# Mirrors packages/cosmos-policy +# `libero` group; `libero` from PyPI declares `robosuite` transitively so we +# don't need to list it here. 
The numpy upper bound that LIBERO/robosuite/numba +# needs is enforced via `[tool.uv].override-dependencies` (cannot be group-scoped +# because uv overrides are global). +libero = [ + "bddl", + "cloudpickle", + "draccus", + "easydict", + "gym", + "imageio[ffmpeg]", + "libero", + "mujoco==3.3.2", +] +# WebSocket policy server (closed-loop robot policy eval via openpi protocol). +# See docs/action_policy_robolab_websocket.md. +policy-server = [ + "filelock>=3.27.0", + "openpi-server", +] + +[project.readme] +content-type = "text/markdown" +text = ''' +# Cosmos3-VFM + +[Documentation](https://github.com/NVIDIA/cosmos-framework/blob/main/README.md) +''' + +[project.urls] +documentation = "https://github.com/NVIDIA/cosmos-framework/blob/main/README.md" +homepage = "https://research.nvidia.com/labs/dir/cosmos3" +issues = "https://github.com/NVIDIA/cosmos-framework/issues" +repository = "https://github.com/NVIDIA/cosmos-framework" + +[tool.uv] +required-version = ">=0.11.3" +conflicts = [ + [ + {group = "vllm"}, + {group = "cu128"}, + {group = "cu128-train"}, + {group = "cu130"}, + {group = "cu130-train"}, + ], +] +override-dependencies = [ + # Lower bound defeats robomimic/robosuite's old numpy 1.x pins; upper bound + # is required by numba (transitive of robosuite via the `libero` group), whose + # latest release supports numpy <2.3. uv overrides are global, so this also + # applies when `--group=libero` is not active — acceptable because no other + # group currently needs numpy 2.3+. 
+ "numpy>=2.0.0,<2.3", + "lightning; sys_platform == 'never'", # PyPi quarantined: https://pypi.org/project/lightning/ + "pynvml; sys_platform == 'never'", +] +required-environments = [ + "sys_platform == 'linux' and platform_machine == 'x86_64'", + "sys_platform == 'linux' and platform_machine == 'aarch64'", +] + +[tool.uv.sources] +diffusers-cosmos3 = { path = "packages/diffusers-cosmos3", editable = true } +flash-attn = { index = "cosmos"} +flash-attn-3-nv = { index = "cosmos" } +lerobot = { git = "https://github.com/mli0603/lerobot.git" } +megatron-core = { git = "https://github.com/NVIDIA/Megatron-LM.git", rev = "de56227" } +natten = { index = "cosmos"} +torch = { index = "pytorch"} +torchao = { index = "pytorch"} +torchcodec = { index = "pytorch" } +torchvision = { index = "pytorch"} +transformer-engine = { index = "cosmos"} +triton = { index = "pytorch"} + +[[tool.uv.index]] +name = "cosmos" +url = "https://nvidia-cosmos.github.io/cosmos-dependencies/v1.5.0" +explicit = true + +[[tool.uv.index]] +name = "natten" +url = "https://whl.natten.org" +explicit = true +format = "flat" + +[[tool.uv.index]] +name = "pytorch" +url = "https://download.pytorch.org/whl" +explicit = true + +[tool.uv.audit] +ignore = [ + # diskcache: No known fix + "GHSA-w8v5-vhqr-4h9v", + # transformers: Upgrade to >=5 + "GHSA-69w3-r845-3855", + "PYSEC-2025-217", + # flash-attn: flash-attn deserialization isn't used in cosmos3 package + "GHSA-7g5w-pq96-8c5w", + # Need diffusers 0.38 + "GHSA-7wx4-6vff-v64p", + "GHSA-98h9-4798-4q5v", + "PYSEC-2026-41", + # torch, no fix + "PYSEC-2026-139", + # Need vllm 0.20 + "GHSA-83vm-p52w-f9pw", + "GHSA-hpv8-x276-m59f" +] + +[tool.hatch.build.targets.sdist] +packages = [ + "cosmos_framework", +] + +[tool.hatch.build.targets.wheel] +packages = [ + "cosmos_framework", +] +exclude = [ + "*_test.py", +] diff --git a/cosmos-framework/sitecustomize.py b/cosmos-framework/sitecustomize.py new file mode 100644 index 
#!/usr/bin/env bash
# Launch script for the Cosmos3 action viewer.
#
# Fills in defaults for every setting that is not already present in the
# environment, prints the effective dataset roots, and then replaces itself
# with the viewer process.
set -euo pipefail

export PORT="${PORT:-7860}"
export PYTHONUNBUFFERED=1
export PYTHONPATH="${PYTHONPATH:-/app/cosmos-framework}"
export HF_HOME="${HF_HOME:-/tmp/.cache/huggingface}"
export XDG_CACHE_HOME="${XDG_CACHE_HOME:-/tmp/.cache}"
export MPLCONFIGDIR="${MPLCONFIGDIR:-/tmp/matplotlib}"
export MUJOCO_GL="${MUJOCO_GL:-osmesa}"
export HF_HUB_DISABLE_XET="${HF_HUB_DISABLE_XET:-1}"

# Default to the packaged Bridge demo subset. No HF download/streaming is needed.
export COSMOS_VIEWER_ON_DEMAND_VIDEO="${COSMOS_VIEWER_ON_DEMAND_VIDEO:-0}"
export COSMOS_VIEWER_DOWNLOAD_DATA="${COSMOS_VIEWER_DOWNLOAD_DATA:-0}"
export BRIDGE_LEROBOT_ROOT="${BRIDGE_LEROBOT_ROOT:-/app/assets/examples/bridge_lerobot_v3}"
export AV_ROOT="${AV_ROOT:-/app/assets/examples/av_v2_03292026_wdinfo}"
export UMI_ROOT="${UMI_ROOT:-/app/assets/examples/fastumi/fastumi_single_arm/pour_coke}"
export FRACTAL_ROOT="${FRACTAL_ROOT:-/app/assets/examples/fractal20220817_data}"
export DROID_ROOT="${DROID_ROOT:-/app/assets/examples/droid_plus_lerobot_640x360_20260412}"
export ROBOMIND_FRANKA_ROOT="${ROBOMIND_FRANKA_ROOT:-/app/assets/examples/RoboMIND_20251228/benchmark1_0_release/franka_3rgb/241021_close_trash_bin_1}"
export ROBOMIND_FRANKA_DUAL_ROOT="${ROBOMIND_FRANKA_DUAL_ROOT:-/app/assets/examples/RoboMIND_20251228/benchmark1_1_release/franka_fr3_dual/both_pour_water}"
export ROBOMIND_ROOT="${ROBOMIND_ROOT:-/app/assets/examples/RoboMIND_20251228}"

# Cache locations default to /tmp paths; create them up front.
mkdir -p "${HF_HOME}" "${XDG_CACHE_HOME}"

echo "Starting Cosmos3 action viewer on port ${PORT}"
echo "BRIDGE_LEROBOT_ROOT=${BRIDGE_LEROBOT_ROOT}"
echo "COSMOS_VIEWER_ON_DEMAND_VIDEO=${COSMOS_VIEWER_ON_DEMAND_VIDEO}"
echo "AV_ROOT=${AV_ROOT}"
echo "UMI_ROOT=${UMI_ROOT}"
echo "FRACTAL_ROOT=${FRACTAL_ROOT}"
echo "DROID_ROOT=${DROID_ROOT}"
echo "ROBOMIND_FRANKA_ROOT=${ROBOMIND_FRANKA_ROOT}"
echo "ROBOMIND_FRANKA_DUAL_ROOT=${ROBOMIND_FRANKA_DUAL_ROOT}"

cd /app/cosmos-framework
# `exec` replaces this shell with the viewer, so signals sent to the script's
# PID (e.g. SIGTERM on container stop) reach Python directly instead of a
# bash parent that does not forward them.
exec python cosmos_framework/data/vfm/action/urdf_visualizer/viewer.py --port "${PORT}"