NV-MedTech
Safetensors
Gr00tN1d6
nigeln committed on
Commit
47d8583
·
0 Parent(s):

Squash history into single release commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
EXPLAINABILITY.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # **Explainability**
2
+
3
+ |Field:|Response:|
4
+ |:---:|:---:|
5
+ |Intended Domain:| Open healthcare foundation model for generalized surgical and ultrasound robotics reasoning and skills.|
6
+ |Model Type: |Robot VLA|
7
+ |Intended Users:|Researchers and developers working on surgical robotics and ultrasound applications.|
8
+ |Output:|The model outputs are actions, and the units are floating-point values. This is referred to as "robot action policy." Actions consist of continuous-value vectors that correspond to different motor controls on a robot.|
9
+ |Describe how the model works:|Accepts multimodal inputs such as video, ultrasound, proprioception, and language, then outputs a robot action policy.|
10
+ |Technical Limitations & Mitigation:| This model is not tested or intended for use in mission critical or clinical applications that require functional safety. Use in those applications is at the user's own risk and sole responsibility, including taking the necessary steps to add needed guardrails or safety mechanisms prior to deployment. More generally, limitations include, but are not limited to:<br>- The model may underperform in operating room environments or device configurations that differ from the training distribution.<br>- Coverage may be limited for rare procedures, uncommon instruments, specialized workflows, or underrepresented institutions.<br>- Performance can vary across sites, sensors, and embodiment interfaces; additional fine-tuning and validation may be required for new deployments.<br><br>Risks and possible mitigations include:<br>Risk: Model underperformance in variable operating room conditions, device configurations, or imaging settings.<br>Mitigation: Expand data coverage across devices/settings and fine-tune for target environments.<br><br>Risk: Integration challenges across surgical/ultrasound platforms with different control interfaces or sensing configurations.<br>Mitigation: Provide embodiment-specific integration guidance and validation procedures.<br><br>Risk: Limited coverage for rare procedures or uncommon instrument/tooling setups.<br>Mitigation: Curate targeted data and evaluate on representative task subsets.|
11
+ |Verified to have met prescribed quality standards?|Yes|
12
+ |Performance Metrics:|Success rate, as well as the following:<br>1) whether the trajectory is smooth and does not jitter<br>2) whether the robot avoids hitting any other objects<br>3) whether the trajectory is natural|
13
+ |Potential Known Risks:|This model is not tested or intended for clinical applications that require functional safety. The use of the model in those applications is at the user's own risk and sole responsibility, including taking the necessary steps to add needed guardrails or safety mechanisms prior to deployment.|
14
+ |End User License Agreement:| Your use of this model is governed by the [NSCL V1 License](https://developer.download.nvidia.com/licenses/NVIDIA-OneWay-Noncommercial-License-22Mar2022.pdf).|
LICENSE ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NVIDIA License
2
+ 1. Definitions
3
+ “Licensor” means any person or entity that distributes its Work.
4
+ “Work” means (a) the original work of authorship made available under this license,
5
+ which may include software, documentation, or other files, and (b) any additions to or
6
+ derivative works thereof that are made available under this license.
7
+ The terms “reproduce,” “reproduction,” “derivative works,” and “distribution” have the
8
+ meaning as provided under U.S. copyright law; provided, however, that for the purposes
9
+ of this license, derivative works shall not include works that remain separable from, or
10
+ merely link (or bind by name) to the interfaces of, the Work.
11
+ Works are “made available” under this license by including in or with the Work either (a)
12
+ a copyright notice referencing the applicability of this license to the Work, or (b) a copy
13
+ of this license.
14
+ 2. License Grant
15
+ 2.1 Copyright Grant. Subject to the terms and conditions of this license, each
16
+ Licensor grants to you a perpetual, worldwide, non-exclusive, royalty-free,
17
+ copyright license to use, reproduce, prepare derivative works of, publicly display,
18
+ publicly perform, sublicense and distribute its Work and any resulting derivative
19
+ works in any form.
20
+ 3. Limitations
21
+ 3.1 Redistribution. You may reproduce or distribute the Work only if (a) you do so
22
+ under this license, (b) you include a complete copy of this license with your
23
+ distribution, and (c) you retain without modification any copyright, patent,
24
+ trademark, or attribution notices that are present in the Work.
25
+ 3.2 Derivative Works. You may specify that additional or different terms apply to
26
+ the use, reproduction, and distribution of your derivative works of the Work (“Your
27
+ Terms”) only if (a) Your Terms provide that the use limitation in Section 3.3
28
+ applies to your derivative works, and (b) you identify the specific derivative works
29
+ that are subject to Your Terms. Notwithstanding Your Terms, this license (including
30
+ the redistribution requirements in Section 3.1) will continue to apply to the Work
31
+ itself.
32
+ 3.3 Use Limitation. The Work and any derivative works thereof only may be used
33
+ or intended for use non-commercially. Notwithstanding the foregoing, NVIDIA
34
+ Corporation and its affiliates may use the Work and any derivative works
35
+ commercially. As used herein, “non-commercially” means for research or
36
+ evaluation purposes only.
37
+ 3.4 Patent Claims. If you bring or threaten to bring a patent claim against any
38
+ Licensor (including any claim, cross-claim or counterclaim in a lawsuit) to enforce
39
+ any patents that you allege are infringed by any Work, then your rights under this
40
+ license from such Licensor (including the grant in Section 2.1) will terminate
41
+ immediately.
42
+ 3.5 Trademarks. This license does not grant any rights to use any Licensor’s or its
43
+ affiliates’ names, logos, or trademarks, except as necessary to reproduce the
44
+ notices described in this license.
45
+ 3.6 Termination. If you violate any term of this license, then your rights under this
46
+ license (including the grant in Section 2.1) will terminate immediately.
47
+ 4. Disclaimer of Warranty.
48
+ THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
49
+ EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
50
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-
51
+ INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS
52
+ LICENSE.
53
+ 5. Limitation of Liability.
54
+ EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
55
+ THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
56
+ SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
57
+ INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR
58
+ RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK (INCLUDING BUT
59
+ NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, LOST PROFITS OR
60
+ DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER DAMAGES OR LOSSES),
61
+ EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
PRIVACY.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # **Privacy**
2
+
3
+ |Field:|Response:|
4
+ |:---:|:---:|
5
+ |Generatable or reverse engineerable personal data?|None|
6
+ |Personal data used to create this model?|No|
7
+ |How often is dataset reviewed?|Before Release|
8
+ |Was data from user interactions with the AI model (e.g. user input and prompts) used to train the model?|No|
9
+ |Is there provenance for all datasets used in training?|Yes|
10
+ |Does data labeling (annotation, metadata) comply with privacy laws?|Yes|
11
+ |Is data compliant with data subject requests for data correction or removal, if such a request was made?|Yes|
12
+ |Applicable NVIDIA Privacy Policy|https://www.nvidia.com/en-us/about-nvidia/privacy-policy/|
README.md ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ license_name: nvidia-license
4
+ license_link: >-
5
+ https://developer.download.nvidia.com/licenses/NVIDIA-OneWay-Noncommercial-License-22Mar2022.pdf
6
+ base_model:
7
+ - nvidia/GR00T-N1.6-3B
8
+ datasets:
9
+ - nvidia/PhysicalAI-Robotics-Open-H-Embodiment
10
+ library_name: nv-medtech
11
+ ---
12
+
13
+ <div align="center">
14
+ <a href="https://github.com/NVIDIA-Medtech/GR00T-H">
15
+ <img src="media/gr00t-h.png" alt="GR00T-H" style="max-width:600px; width:100%; height:auto;">
16
+ </a>
17
+ <div align="center">
18
+ <a href="https://github.com/NVIDIA-Medtech/GR00T-H">
19
+ <img src="https://img.shields.io/badge/GitHub-grey?logo=GitHub" alt="GitHub Badge">
20
+ </a>
21
+ <a href="https://huggingface.co/datasets/nvidia/PhysicalAI-Robotics-Open-H-Embodiment">
22
+ <img src="https://img.shields.io/badge/Dataset-Open--H--Embodiment-brightgreen?logo=huggingface" alt="Dataset Badge">
23
+ </a>
24
+ </div>
25
+ </div>
26
+
27
+ # GR00T-H
28
+
29
+ ### Description:
30
+
31
+ GR00T-H is a post-trained variant of NVIDIA Isaac GR00T N1.6 for surgical robotics. It builds on the GR00T N1.6 VLA foundation and adapts it using the Open-H embodiment dataset.
32
+
33
+ This model is for research and development only.
34
+
35
+ The neural network architecture is inherited from the GR00T N series of models, combining a vision-language foundation model with a diffusion transformer head that denoises continuous actions.
36
+
37
+ ### License/Terms of Use:
38
+
39
+ [NVIDIA License](https://developer.download.nvidia.com/licenses/NVIDIA-OneWay-Noncommercial-License-22Mar2022.pdf)
40
+
41
+ You are responsible for ensuring that your use of NVIDIA provided models complies with all applicable laws.
42
+
43
+
44
+ ### Deployment Geography:
45
+
46
+ Global
47
+
48
+ ### Use Case:
49
+
50
+ Researchers and Academics: Healthcare-focused robotics research and algorithm development.
51
+
52
+ ### Intended Use
53
+
54
+ GR00T-H is intended for use in robotics R&D, including exploration of surgical robotics and robotic ultrasound policies, benchmarking, and method development. It is not intended for clinical deployment, patient care, or medical decision-making.
55
+
56
+ ## Reference(s):
57
+
58
+ - Isaac GR00T N1.6: [GR00T-N1.6-3B](https://huggingface.co/nvidia/GR00T-N1.6-3B)
59
+ - GR00T Website: [NVIDIA Isaac GR00T](https://developer.nvidia.com/isaac/gr00t)
60
+ - Eagle VLM: Chen, Guo, et al. "Eagle 2.5: Boosting Long-Context Post-Training for Frontier Vision-Language Models." arXiv:2504.15271 (2025).
61
+ - Rectified Flow: Liu, Xingchao, and Chengyue Gong. "Flow Straight and Fast: Learning to Generate and Transfer Data with Rectified Flow." The Eleventh International Conference on Learning Representations (ICLR), 2023.
62
+ - Flow Matching Policy: Black, Kevin, et al. "π0: A Vision-Language-Action Flow Model for General Robot Control." arXiv preprint arXiv:2410.24164 (2024).
63
+
64
+
65
+ ## Model Architecture:
66
+
67
+ **Architecture Type:** Vision Transformer, Multilayer Perceptron, Flow matching Transformer
68
+
69
+ **This model was developed based on GR00T N1.6.**
70
+
71
+ **Number of model parameters:** 3B
72
+
73
+ GR00T-H uses vision and text transformers to encode the robot's image observations and text instructions. The architecture handles a varying number of views per embodiment by concatenating image token embeddings from all frames into a sequence, followed by language token embeddings.
74
+
75
+ To model proprioception and a sequence of actions conditioned on observations, GR00T-H uses a flow matching transformer. The flow matching transformer interleaves self-attention over proprioception and actions with cross-attention to the vision and language embeddings. During training, the input actions are corrupted by randomly interpolating between the clean action vector and a Gaussian noise vector. At inference time, the policy first samples a Gaussian noise vector and iteratively reconstructs a continuous-value action using its velocity prediction.
76
+
77
+ In GR00T N1.6, the MLP connector between the vision-language features and the diffusion-transformer (DiT) has been modified for improved performance on our sim benchmarks. Also, it was trained jointly with flow matching and world-modeling objectives.
78
+
79
+ **Network Architecture:**
80
+ ![Network Architecture](https://github.com/NVIDIA/Isaac-GR00T/blob/main/media/model-architecture.png?raw=true)
81
+ The schematic diagram is shown in the illustration above.
82
+ Red, Green, Blue (RGB) camera frames are processed through a pre-trained vision transformer (SigLip2).
83
+ Robot proprioception is encoded using a multi-layer perceptron (MLP) indexed by the embodiment ID. To handle variable-dimension proprio, inputs are padded to a configurable max length before feeding into the MLP.
84
+ Actions are encoded and velocity predictions decoded by an MLP, one per unique embodiment.
85
+ The flow matching transformer is implemented as a diffusion transformer (DiT), in which the diffusion step conditioning is implemented using adaptive layernorm (AdaLN).
86
+
87
+ ## Input(s):
88
+
89
+ **Input Type(s):**
90
+ - Vision: Image Frames
91
+ - State: Robot Proprioception
92
+ - Language Instruction: Text
93
+
94
+ **Input Format(s):**
95
+ - Vision: Variable number of image frames from robot cameras
96
+ - State: Floating Point
97
+ - Language Instruction: String
98
+
99
+ **Input Parameters:**
100
+ - Vision: Two-Dimensional (2D) - Red, Green, Blue (RGB) image, any resolution
101
+ - State: One-Dimensional (1D) - Floating number vector
102
+ - Language Instruction: One-Dimensional (1D) - String
103
+
104
+
105
+ ## Output(s)
106
+
107
+ **Output Type(s):** Actions
108
+
109
+ **Output Format:** Continuous-value vectors
110
+
111
+ **Output Parameters:** Two-Dimensional (2D)
112
+
113
+ **Other Properties Related to Output:** Continuous-value vectors correspond to different motor controls on a robot, which depend on the degrees of freedom of the robot embodiment.
114
+
115
+ Our AI models are designed and/or optimized to run on NVIDIA GPU-accelerated systems. By leveraging NVIDIA’s hardware (e.g., GPU cores) and software frameworks (e.g., CUDA libraries), the model achieves faster training and inference times compared to CPU-only solutions.
116
+
117
+
118
+ ## Software Integration:
119
+
120
+ **Runtime Engine(s):** PyTorch, TensorRT
121
+
122
+ **Supported Hardware Microarchitecture Compatibility:**
123
+ All of the below:
124
+
125
+ - NVIDIA Ampere
126
+ - NVIDIA Blackwell
127
+ - NVIDIA Jetson
128
+ - NVIDIA Hopper
129
+ - NVIDIA Lovelace
130
+
131
+ **Supported Operating System:**
132
+
133
+ - Ubuntu
134
+
135
+ ## Model Version(s):
136
+
137
+ Version 1.0
138
+
139
+ ## Training, Testing, and Evaluation Datasets:
140
+
141
+ ### Dataset Overview
142
+
143
+ - **Total Size:** 601.50 hours (training subset used for GR00T-H post-training)
144
+ - **Total Number of Datasets:** 58
145
+ - **Dataset Partition:** Training 98%, Testing N/A (real-world robot evaluation only), Validation 2%
146
+
147
+ ### Training Data Summary
148
+
149
+ GR00T-H is trained on the Open-H-Embodiment dataset, a community-driven effort to assemble large-scale, multimodal healthcare robotics data for generalist VLA models. The full Open-H-Embodiment dataset contains 778 hours of real and synthetic procedure episodes with synchronized streams such as video, kinematics, force/torque, ultrasound, and domain-specific sensors. For GR00T-H post-training, a curated 601.50-hour subset was used.
150
+
151
+ GR00T-H was trained on 7 different robotic embodiments contained within Open-H, including CMR Versius, dVRK, dVRK-Si, UR5, Rob Surgical Bitrack, Tuodao MA2000, and KUKA.
152
+
153
+ To enable better cross-embodiment transfer, the action space was standardized to absolute end-effector (EEF) positioning. Additionally, camera configurations were standardized to include only (A) a single third-person monocular view, or (B) a third-person monocular view with wrist camera(s) and/or additional modalities (e.g., ultrasound images).
154
+
155
+ The Open-H dataset was collected by 35 institutions across the globe. Data collection took place in various settings, including simulation, benchtop, ex vivo, in vivo, and clinical environments. Depending on the dataset, robots were teleoperated either programmatically or by engineers, researchers, medical students, or professional surgeons.
156
+
157
+ ### Training Dataset
158
+
159
+ - **Data Modality:** Video, Kinematics
160
+ - **Video Training Data Size:** 601.50 hours
161
+ - **Kinematic Training Data Size:** 601.50 hours
162
+ - **Data Collection Method:** Hybrid: Automatic/Sensors, Human, Synthetic
163
+ - **Labeling Method:** Hybrid: Automatic/Sensors, Human, Synthetic
164
+ - **Properties:**
165
+ - Open-H is a healthcare robotics dataset composed of time-synchronized video and kinematics, as well as text labels describing the task being completed.
166
+
167
+ ### Evaluation Dataset
168
+
169
+ - **Data Collection Method:** Hybrid: Automatic/Sensors, Human, Synthetic
170
+ - **Labeling Method:** Hybrid: Automatic/Sensors, Human, Synthetic
171
+ - **Properties:**
172
+ - 2% of the training dataset was held out for training-time validation.
173
+ - Primary evaluations are conducted in the real world without a dataset.
174
+
175
+ ## Inference:
176
+
177
+ **Acceleration Engine:** TensorRT
178
+
179
+ **Test Hardware:** NVIDIA Ampere
180
+
181
+ ## Ethical Considerations:
182
+
183
+ NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their internal model team to ensure this model meets requirements for the relevant industry and use case and addresses unforeseen product misuse.
184
+
185
+ Please make sure you have proper rights and permissions for all input image and video content; if an image or video includes people, personal health information, or intellectual property, the image or video generated will not blur or maintain proportions of image subjects included.
186
+
187
+ For more detailed information on ethical considerations for this model, please see the Model Card++ [Explainability](EXPLAINABILITY.md), [Bias](BIAS.md), [Safety & Security](SAFETY_and_SECURITY.md), and [Privacy](PRIVACY.md) Subcards.
188
+
189
+ Please report model quality, risk, security vulnerabilities or NVIDIA AI Concerns [here](https://app.intigriti.com/programs/nvidia/nvidiavdp/detail).
SAFETY_and_SECURITY.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # **Safety & Security**
2
+
3
+ |Field:|Response:|
4
+ |:---:|:---:|
5
+ |Model Application(s):|Robotic surgical and ultrasound research and development.|
6
+ |Describe life-critical application (if present):|This model is intended for research and is not tested or intended for clinical or mission critical applications that require functional safety. It is not for deployment in clinical settings or for medical decision-making. Any real-world use requires independent safety review and regulatory clearance.|
7
+ |Use Case Restrictions:| Abide by the [NSCL V1 License](https://developer.download.nvidia.com/licenses/NVIDIA-OneWay-Noncommercial-License-22Mar2022.pdf)|
8
+ |Model and Dataset Restrictions:|The principle of least privilege (PoLP) is applied, limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints are adhered to.|
config.json ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_horizon": 50,
3
+ "add_pos_embed": true,
4
+ "apply_sincos_state_encoding": true,
5
+ "architectures": [
6
+ "Gr00tN1d6"
7
+ ],
8
+ "attn_dropout": 0.2,
9
+ "attn_implementation": null,
10
+ "backbone_embedding_dim": 2048,
11
+ "backbone_model_type": "eagle",
12
+ "backbone_trainable_params_fp32": true,
13
+ "collator_overwrite_image_inputs": false,
14
+ "color_jitter_params": {
15
+ "brightness": 0.1,
16
+ "contrast": 0.1,
17
+ "hue": 0.1,
18
+ "saturation": 0.1
19
+ },
20
+ "crop_fraction": 0.95,
21
+ "diffusion_model_cfg": {
22
+ "attention_head_dim": 48,
23
+ "dropout": 0.2,
24
+ "final_dropout": true,
25
+ "interleave_self_attention": true,
26
+ "norm_type": "ada_norm",
27
+ "num_attention_heads": 32,
28
+ "num_layers": 32,
29
+ "output_dim": 1024,
30
+ "positional_embeddings": null
31
+ },
32
+ "eagle_collator": true,
33
+ "formalize_language": true,
34
+ "gemma_collator": false,
35
+ "hidden_size": 1024,
36
+ "image_crop_size": null,
37
+ "image_target_size": null,
38
+ "input_embedding_dim": 1536,
39
+ "load_bf16": true,
40
+ "max_action_dim": 128,
41
+ "max_num_embodiments": 32,
42
+ "max_seq_len": 1024,
43
+ "max_state_dim": 128,
44
+ "model_dtype": "bfloat16",
45
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
46
+ "model_type": "Gr00tN1d6",
47
+ "noise_beta_alpha": 1.5,
48
+ "noise_beta_beta": 1.0,
49
+ "noise_s": 0.999,
50
+ "num_inference_timesteps": 4,
51
+ "num_timestep_buckets": 1000,
52
+ "random_rotation_angle": null,
53
+ "reproject_vision": false,
54
+ "select_layer": 16,
55
+ "shortest_image_edge": 256,
56
+ "soft_prompt_lr_scale": 1.0,
57
+ "soft_prompt_num_tokens": 32,
58
+ "state_dropout_prob": 0.0,
59
+ "state_dropout_prob_per_embodiment": {
60
+ "cmr_versius": 1.0,
61
+ "jhu_imerse_dvrk": 1.0,
62
+ "obuda_dvrk": 1.0,
63
+ "stanford_dvrk_real": 1.0,
64
+ "ucb_dvrk": 1.0,
65
+ "ucsd_dvrk": 1.0,
66
+ "hamlyn_dvrk_30hz": 1.0,
67
+ "jhu_imerse_dvrk_mono": 1.0,
68
+ "jhu_imerse_star_il": 1.0,
69
+ "jhu_lscr_dvrk_smarts": 1.0,
70
+ "rob_surgical_bitrack": 1.0,
71
+ "tud_tundra_ur5e": 1.0,
72
+ "turin_mitic_ex_vivo": 1.0,
73
+ "ustc_torin_tuodao": 1.0
74
+ },
75
+ "torch_dtype": "bfloat16",
76
+ "transformers_version": "4.51.3",
77
+ "tune_diffusion_model": true,
78
+ "tune_llm": true,
79
+ "tune_projector": true,
80
+ "tune_top_llm_layers": 4,
81
+ "tune_visual": true,
82
+ "tune_vlln": true,
83
+ "use_albumentations_transforms": true,
84
+ "use_alternate_vl_dit": true,
85
+ "use_flash_attention": true,
86
+ "use_relative_action": true,
87
+ "use_soft_prompts": false,
88
+ "use_vlln": true
89
+ }
embodiment_id.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "robocasa_panda_omron": 13,
3
+ "gr1": 20,
4
+ "behavior_r1_pro": 24,
5
+ "unitree_g1": 8,
6
+ "oxe_google": 0,
7
+ "oxe_widowx": 1,
8
+ "libero_panda": 2,
9
+ "new_embodiment": 10,
10
+ "jhu_imerse_dvrk": 3,
11
+ "cmr_versius": 4,
12
+ "ucb_dvrk": 5,
13
+ "sanoscience_sim": 6,
14
+ "tum_sonata_franka": 7,
15
+ "hamlyn_dvrk_15hz": 9,
16
+ "hamlyn_dvrk_30hz": 11,
17
+ "ustc_torin_tuodao": 12,
18
+ "ucsd_dvrk": 14,
19
+ "jhu_imerse_dvrk_mono": 15,
20
+ "rob_surgical_bitrack": 16,
21
+ "stanford_dvrk_real": 17,
22
+ "obuda_dvrk": 18,
23
+ "polyu_sim": 19,
24
+ "moon_maestro": 21,
25
+ "jhu_lscr_dvrk_miracle": 22,
26
+ "jhu_lscr_dvrk_smarts": 23,
27
+ "jhu_imerse_star_il": 27,
28
+ "tud_tundra_ur5e": 25,
29
+ "turin_mitic_ex_vivo": 26,
30
+ "oxe_droid": 29
31
+ }
media/gr00t-h.png ADDED
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae282371f11c02e2f3d69124511b63c0f7be79d93b482d20e331ff5f543905db
3
+ size 4991094767
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ca84dd2dba8125810bb5f82c1d731a836dba4497c2f10907dfe9f6f64052745
3
+ size 1582283096
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
processor_config.json ADDED
@@ -0,0 +1,2827 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "Gr00tN1d6Processor",
3
+ "processor_kwargs": {
4
+ "modality_configs": {
5
+ "behavior_r1_pro": {
6
+ "video": {
7
+ "delta_indices": [
8
+ 0
9
+ ],
10
+ "modality_keys": [
11
+ "observation.images.rgb.head_256_256",
12
+ "observation.images.rgb.left_wrist_256_256",
13
+ "observation.images.rgb.right_wrist_256_256"
14
+ ],
15
+ "sin_cos_embedding_keys": null,
16
+ "mean_std_embedding_keys": null,
17
+ "min_max_embedding_keys": null,
18
+ "pass_through_keys": null,
19
+ "action_configs": null
20
+ },
21
+ "state": {
22
+ "delta_indices": [
23
+ 0
24
+ ],
25
+ "modality_keys": [
26
+ "robot_pos",
27
+ "robot_ori_cos",
28
+ "robot_ori_sin",
29
+ "robot_2d_ori",
30
+ "robot_2d_ori_cos",
31
+ "robot_2d_ori_sin",
32
+ "robot_lin_vel",
33
+ "robot_ang_vel",
34
+ "arm_left_qpos",
35
+ "arm_left_qpos_sin",
36
+ "arm_left_qpos_cos",
37
+ "eef_left_pos",
38
+ "eef_left_quat",
39
+ "gripper_left_qpos",
40
+ "arm_right_qpos",
41
+ "arm_right_qpos_sin",
42
+ "arm_right_qpos_cos",
43
+ "eef_right_pos",
44
+ "eef_right_quat",
45
+ "gripper_right_qpos",
46
+ "trunk_qpos"
47
+ ],
48
+ "sin_cos_embedding_keys": null,
49
+ "mean_std_embedding_keys": null,
50
+ "min_max_embedding_keys": null,
51
+ "pass_through_keys": null,
52
+ "action_configs": null
53
+ },
54
+ "action": {
55
+ "delta_indices": [
56
+ 0,
57
+ 1,
58
+ 2,
59
+ 3,
60
+ 4,
61
+ 5,
62
+ 6,
63
+ 7,
64
+ 8,
65
+ 9,
66
+ 10,
67
+ 11,
68
+ 12,
69
+ 13,
70
+ 14,
71
+ 15,
72
+ 16,
73
+ 17,
74
+ 18,
75
+ 19,
76
+ 20,
77
+ 21,
78
+ 22,
79
+ 23,
80
+ 24,
81
+ 25,
82
+ 26,
83
+ 27,
84
+ 28,
85
+ 29,
86
+ 30,
87
+ 31
88
+ ],
89
+ "modality_keys": [
90
+ "base",
91
+ "torso",
92
+ "left_arm",
93
+ "left_gripper",
94
+ "right_arm",
95
+ "right_gripper"
96
+ ],
97
+ "sin_cos_embedding_keys": null,
98
+ "mean_std_embedding_keys": null,
99
+ "min_max_embedding_keys": null,
100
+ "pass_through_keys": null,
101
+ "action_configs": [
102
+ {
103
+ "rep": "ABSOLUTE",
104
+ "type": "NON_EEF",
105
+ "format": "DEFAULT",
106
+ "state_key": null,
107
+ "normalization_type": "percentile",
108
+ "input_rotation_format": "quat",
109
+ "input_quat_order": "xyzw",
110
+ "reference_rotation_format": "rot6d",
111
+ "reference_quat_order": "xyzw",
112
+ "translation_scaling_key": null,
113
+ "rotation_scaling_key": null,
114
+ "hold_through_clutch": false
115
+ },
116
+ {
117
+ "rep": "RELATIVE",
118
+ "type": "NON_EEF",
119
+ "format": "DEFAULT",
120
+ "state_key": "trunk_qpos",
121
+ "normalization_type": "percentile",
122
+ "input_rotation_format": "quat",
123
+ "input_quat_order": "xyzw",
124
+ "reference_rotation_format": "rot6d",
125
+ "reference_quat_order": "xyzw",
126
+ "translation_scaling_key": null,
127
+ "rotation_scaling_key": null,
128
+ "hold_through_clutch": false
129
+ },
130
+ {
131
+ "rep": "RELATIVE",
132
+ "type": "NON_EEF",
133
+ "format": "DEFAULT",
134
+ "state_key": "arm_left_qpos",
135
+ "normalization_type": "percentile",
136
+ "input_rotation_format": "quat",
137
+ "input_quat_order": "xyzw",
138
+ "reference_rotation_format": "rot6d",
139
+ "reference_quat_order": "xyzw",
140
+ "translation_scaling_key": null,
141
+ "rotation_scaling_key": null,
142
+ "hold_through_clutch": false
143
+ },
144
+ {
145
+ "rep": "ABSOLUTE",
146
+ "type": "NON_EEF",
147
+ "format": "DEFAULT",
148
+ "state_key": null,
149
+ "normalization_type": "percentile",
150
+ "input_rotation_format": "quat",
151
+ "input_quat_order": "xyzw",
152
+ "reference_rotation_format": "rot6d",
153
+ "reference_quat_order": "xyzw",
154
+ "translation_scaling_key": null,
155
+ "rotation_scaling_key": null,
156
+ "hold_through_clutch": false
157
+ },
158
+ {
159
+ "rep": "RELATIVE",
160
+ "type": "NON_EEF",
161
+ "format": "DEFAULT",
162
+ "state_key": "arm_right_qpos",
163
+ "normalization_type": "percentile",
164
+ "input_rotation_format": "quat",
165
+ "input_quat_order": "xyzw",
166
+ "reference_rotation_format": "rot6d",
167
+ "reference_quat_order": "xyzw",
168
+ "translation_scaling_key": null,
169
+ "rotation_scaling_key": null,
170
+ "hold_through_clutch": false
171
+ },
172
+ {
173
+ "rep": "ABSOLUTE",
174
+ "type": "NON_EEF",
175
+ "format": "DEFAULT",
176
+ "state_key": null,
177
+ "normalization_type": "percentile",
178
+ "input_rotation_format": "quat",
179
+ "input_quat_order": "xyzw",
180
+ "reference_rotation_format": "rot6d",
181
+ "reference_quat_order": "xyzw",
182
+ "translation_scaling_key": null,
183
+ "rotation_scaling_key": null,
184
+ "hold_through_clutch": false
185
+ }
186
+ ]
187
+ },
188
+ "language": {
189
+ "delta_indices": [
190
+ 0
191
+ ],
192
+ "modality_keys": [
193
+ "annotation.human.coarse_action"
194
+ ],
195
+ "sin_cos_embedding_keys": null,
196
+ "mean_std_embedding_keys": null,
197
+ "min_max_embedding_keys": null,
198
+ "pass_through_keys": null,
199
+ "action_configs": null
200
+ }
201
+ },
202
+ "gr1": {
203
+ "video": {
204
+ "delta_indices": [
205
+ 0
206
+ ],
207
+ "modality_keys": [
208
+ "ego_view_bg_crop_pad_res256_freq20"
209
+ ],
210
+ "sin_cos_embedding_keys": null,
211
+ "mean_std_embedding_keys": null,
212
+ "min_max_embedding_keys": null,
213
+ "pass_through_keys": null,
214
+ "action_configs": null
215
+ },
216
+ "state": {
217
+ "delta_indices": [
218
+ 0
219
+ ],
220
+ "modality_keys": [
221
+ "left_arm",
222
+ "right_arm",
223
+ "left_hand",
224
+ "right_hand",
225
+ "waist"
226
+ ],
227
+ "sin_cos_embedding_keys": [
228
+ "left_arm",
229
+ "right_arm",
230
+ "left_hand",
231
+ "right_hand",
232
+ "waist"
233
+ ],
234
+ "mean_std_embedding_keys": null,
235
+ "min_max_embedding_keys": null,
236
+ "pass_through_keys": null,
237
+ "action_configs": null
238
+ },
239
+ "action": {
240
+ "delta_indices": [
241
+ 0,
242
+ 1,
243
+ 2,
244
+ 3,
245
+ 4,
246
+ 5,
247
+ 6,
248
+ 7,
249
+ 8,
250
+ 9,
251
+ 10,
252
+ 11,
253
+ 12,
254
+ 13,
255
+ 14,
256
+ 15
257
+ ],
258
+ "modality_keys": [
259
+ "left_arm",
260
+ "right_arm",
261
+ "left_hand",
262
+ "right_hand",
263
+ "waist"
264
+ ],
265
+ "sin_cos_embedding_keys": null,
266
+ "mean_std_embedding_keys": null,
267
+ "min_max_embedding_keys": null,
268
+ "pass_through_keys": null,
269
+ "action_configs": [
270
+ {
271
+ "rep": "RELATIVE",
272
+ "type": "NON_EEF",
273
+ "format": "DEFAULT",
274
+ "state_key": null,
275
+ "normalization_type": "percentile",
276
+ "input_rotation_format": "quat",
277
+ "input_quat_order": "xyzw",
278
+ "reference_rotation_format": "rot6d",
279
+ "reference_quat_order": "xyzw",
280
+ "translation_scaling_key": null,
281
+ "rotation_scaling_key": null,
282
+ "hold_through_clutch": false
283
+ },
284
+ {
285
+ "rep": "RELATIVE",
286
+ "type": "NON_EEF",
287
+ "format": "DEFAULT",
288
+ "state_key": null,
289
+ "normalization_type": "percentile",
290
+ "input_rotation_format": "quat",
291
+ "input_quat_order": "xyzw",
292
+ "reference_rotation_format": "rot6d",
293
+ "reference_quat_order": "xyzw",
294
+ "translation_scaling_key": null,
295
+ "rotation_scaling_key": null,
296
+ "hold_through_clutch": false
297
+ },
298
+ {
299
+ "rep": "RELATIVE",
300
+ "type": "NON_EEF",
301
+ "format": "DEFAULT",
302
+ "state_key": null,
303
+ "normalization_type": "percentile",
304
+ "input_rotation_format": "quat",
305
+ "input_quat_order": "xyzw",
306
+ "reference_rotation_format": "rot6d",
307
+ "reference_quat_order": "xyzw",
308
+ "translation_scaling_key": null,
309
+ "rotation_scaling_key": null,
310
+ "hold_through_clutch": false
311
+ },
312
+ {
313
+ "rep": "RELATIVE",
314
+ "type": "NON_EEF",
315
+ "format": "DEFAULT",
316
+ "state_key": null,
317
+ "normalization_type": "percentile",
318
+ "input_rotation_format": "quat",
319
+ "input_quat_order": "xyzw",
320
+ "reference_rotation_format": "rot6d",
321
+ "reference_quat_order": "xyzw",
322
+ "translation_scaling_key": null,
323
+ "rotation_scaling_key": null,
324
+ "hold_through_clutch": false
325
+ },
326
+ {
327
+ "rep": "ABSOLUTE",
328
+ "type": "NON_EEF",
329
+ "format": "DEFAULT",
330
+ "state_key": null,
331
+ "normalization_type": "percentile",
332
+ "input_rotation_format": "quat",
333
+ "input_quat_order": "xyzw",
334
+ "reference_rotation_format": "rot6d",
335
+ "reference_quat_order": "xyzw",
336
+ "translation_scaling_key": null,
337
+ "rotation_scaling_key": null,
338
+ "hold_through_clutch": false
339
+ }
340
+ ]
341
+ },
342
+ "language": {
343
+ "delta_indices": [
344
+ 0
345
+ ],
346
+ "modality_keys": [
347
+ "task"
348
+ ],
349
+ "sin_cos_embedding_keys": null,
350
+ "mean_std_embedding_keys": null,
351
+ "min_max_embedding_keys": null,
352
+ "pass_through_keys": null,
353
+ "action_configs": null
354
+ }
355
+ },
356
+ "robocasa_panda_omron": {
357
+ "video": {
358
+ "delta_indices": [
359
+ 0
360
+ ],
361
+ "modality_keys": [
362
+ "res256_image_side_0",
363
+ "res256_image_side_1",
364
+ "res256_image_wrist_0"
365
+ ],
366
+ "sin_cos_embedding_keys": null,
367
+ "mean_std_embedding_keys": null,
368
+ "min_max_embedding_keys": null,
369
+ "pass_through_keys": null,
370
+ "action_configs": null
371
+ },
372
+ "state": {
373
+ "delta_indices": [
374
+ 0
375
+ ],
376
+ "modality_keys": [
377
+ "end_effector_position_relative",
378
+ "end_effector_rotation_relative",
379
+ "gripper_qpos",
380
+ "base_position",
381
+ "base_rotation"
382
+ ],
383
+ "sin_cos_embedding_keys": null,
384
+ "mean_std_embedding_keys": null,
385
+ "min_max_embedding_keys": null,
386
+ "pass_through_keys": null,
387
+ "action_configs": null
388
+ },
389
+ "action": {
390
+ "delta_indices": [
391
+ 0,
392
+ 1,
393
+ 2,
394
+ 3,
395
+ 4,
396
+ 5,
397
+ 6,
398
+ 7,
399
+ 8,
400
+ 9,
401
+ 10,
402
+ 11,
403
+ 12,
404
+ 13,
405
+ 14,
406
+ 15
407
+ ],
408
+ "modality_keys": [
409
+ "end_effector_position",
410
+ "end_effector_rotation",
411
+ "gripper_close",
412
+ "base_motion",
413
+ "control_mode"
414
+ ],
415
+ "sin_cos_embedding_keys": null,
416
+ "mean_std_embedding_keys": null,
417
+ "min_max_embedding_keys": null,
418
+ "pass_through_keys": null,
419
+ "action_configs": [
420
+ {
421
+ "rep": "ABSOLUTE",
422
+ "type": "NON_EEF",
423
+ "format": "DEFAULT",
424
+ "state_key": null,
425
+ "normalization_type": "percentile",
426
+ "input_rotation_format": "quat",
427
+ "input_quat_order": "xyzw",
428
+ "reference_rotation_format": "rot6d",
429
+ "reference_quat_order": "xyzw",
430
+ "translation_scaling_key": null,
431
+ "rotation_scaling_key": null,
432
+ "hold_through_clutch": false
433
+ },
434
+ {
435
+ "rep": "ABSOLUTE",
436
+ "type": "NON_EEF",
437
+ "format": "DEFAULT",
438
+ "state_key": null,
439
+ "normalization_type": "percentile",
440
+ "input_rotation_format": "quat",
441
+ "input_quat_order": "xyzw",
442
+ "reference_rotation_format": "rot6d",
443
+ "reference_quat_order": "xyzw",
444
+ "translation_scaling_key": null,
445
+ "rotation_scaling_key": null,
446
+ "hold_through_clutch": false
447
+ },
448
+ {
449
+ "rep": "ABSOLUTE",
450
+ "type": "NON_EEF",
451
+ "format": "DEFAULT",
452
+ "state_key": null,
453
+ "normalization_type": "percentile",
454
+ "input_rotation_format": "quat",
455
+ "input_quat_order": "xyzw",
456
+ "reference_rotation_format": "rot6d",
457
+ "reference_quat_order": "xyzw",
458
+ "translation_scaling_key": null,
459
+ "rotation_scaling_key": null,
460
+ "hold_through_clutch": false
461
+ },
462
+ {
463
+ "rep": "ABSOLUTE",
464
+ "type": "NON_EEF",
465
+ "format": "DEFAULT",
466
+ "state_key": null,
467
+ "normalization_type": "percentile",
468
+ "input_rotation_format": "quat",
469
+ "input_quat_order": "xyzw",
470
+ "reference_rotation_format": "rot6d",
471
+ "reference_quat_order": "xyzw",
472
+ "translation_scaling_key": null,
473
+ "rotation_scaling_key": null,
474
+ "hold_through_clutch": false
475
+ },
476
+ {
477
+ "rep": "ABSOLUTE",
478
+ "type": "NON_EEF",
479
+ "format": "DEFAULT",
480
+ "state_key": null,
481
+ "normalization_type": "percentile",
482
+ "input_rotation_format": "quat",
483
+ "input_quat_order": "xyzw",
484
+ "reference_rotation_format": "rot6d",
485
+ "reference_quat_order": "xyzw",
486
+ "translation_scaling_key": null,
487
+ "rotation_scaling_key": null,
488
+ "hold_through_clutch": false
489
+ }
490
+ ]
491
+ },
492
+ "language": {
493
+ "delta_indices": [
494
+ 0
495
+ ],
496
+ "modality_keys": [
497
+ "annotation.human.action.task_description"
498
+ ],
499
+ "sin_cos_embedding_keys": null,
500
+ "mean_std_embedding_keys": null,
501
+ "min_max_embedding_keys": null,
502
+ "pass_through_keys": null,
503
+ "action_configs": null
504
+ }
505
+ },
506
+ "cmr_versius": {
507
+ "video": {
508
+ "delta_indices": [
509
+ 0
510
+ ],
511
+ "modality_keys": [
512
+ "endoscope"
513
+ ],
514
+ "sin_cos_embedding_keys": null,
515
+ "mean_std_embedding_keys": null,
516
+ "min_max_embedding_keys": null,
517
+ "pass_through_keys": null,
518
+ "action_configs": null
519
+ },
520
+ "state": {
521
+ "delta_indices": [
522
+ 0
523
+ ],
524
+ "modality_keys": [
525
+ "left_pose",
526
+ "left_gripper",
527
+ "right_pose",
528
+ "right_gripper",
529
+ "translation_scaling",
530
+ "rotation_scaling",
531
+ "hapticengaged_left",
532
+ "hapticengaged_right"
533
+ ],
534
+ "sin_cos_embedding_keys": null,
535
+ "mean_std_embedding_keys": [
536
+ "left_pose",
537
+ "left_gripper",
538
+ "right_pose",
539
+ "right_gripper"
540
+ ],
541
+ "min_max_embedding_keys": null,
542
+ "pass_through_keys": [
543
+ "translation_scaling",
544
+ "rotation_scaling",
545
+ "hapticengaged_left",
546
+ "hapticengaged_right"
547
+ ],
548
+ "action_configs": null
549
+ },
550
+ "action": {
551
+ "delta_indices": [
552
+ 2,
553
+ 4,
554
+ 6,
555
+ 8,
556
+ 10,
557
+ 12,
558
+ 14,
559
+ 16,
560
+ 18,
561
+ 20,
562
+ 22,
563
+ 24,
564
+ 26,
565
+ 28,
566
+ 30,
567
+ 32,
568
+ 34,
569
+ 36,
570
+ 38,
571
+ 40,
572
+ 42,
573
+ 44,
574
+ 46,
575
+ 48,
576
+ 50,
577
+ 52,
578
+ 54,
579
+ 56,
580
+ 58,
581
+ 60,
582
+ 62,
583
+ 64,
584
+ 66,
585
+ 68,
586
+ 70,
587
+ 72,
588
+ 74,
589
+ 76,
590
+ 78,
591
+ 80,
592
+ 82,
593
+ 84,
594
+ 86,
595
+ 88,
596
+ 90,
597
+ 92,
598
+ 94,
599
+ 96,
600
+ 98,
601
+ 100
602
+ ],
603
+ "modality_keys": [
604
+ "left_pose",
605
+ "left_gripper",
606
+ "right_pose",
607
+ "right_gripper",
608
+ "hapticengaged_left",
609
+ "hapticengaged_right"
610
+ ],
611
+ "sin_cos_embedding_keys": null,
612
+ "mean_std_embedding_keys": null,
613
+ "min_max_embedding_keys": null,
614
+ "pass_through_keys": [
615
+ "hapticengaged_left",
616
+ "hapticengaged_right"
617
+ ],
618
+ "action_configs": [
619
+ {
620
+ "rep": "REL_XYZ_ROT6D",
621
+ "type": "EEF",
622
+ "format": "XYZ_ROT6D",
623
+ "state_key": "left_pose",
624
+ "normalization_type": "percentile",
625
+ "input_rotation_format": "quat",
626
+ "input_quat_order": "xyzw",
627
+ "reference_rotation_format": "quat",
628
+ "reference_quat_order": "xyzw",
629
+ "translation_scaling_key": "translation_scaling",
630
+ "rotation_scaling_key": "rotation_scaling",
631
+ "hold_through_clutch": false
632
+ },
633
+ {
634
+ "rep": "ABSOLUTE",
635
+ "type": "NON_EEF",
636
+ "format": "DEFAULT",
637
+ "state_key": "left_gripper",
638
+ "normalization_type": "percentile",
639
+ "input_rotation_format": "quat",
640
+ "input_quat_order": "xyzw",
641
+ "reference_rotation_format": "rot6d",
642
+ "reference_quat_order": "xyzw",
643
+ "translation_scaling_key": null,
644
+ "rotation_scaling_key": null,
645
+ "hold_through_clutch": true
646
+ },
647
+ {
648
+ "rep": "REL_XYZ_ROT6D",
649
+ "type": "EEF",
650
+ "format": "XYZ_ROT6D",
651
+ "state_key": "right_pose",
652
+ "normalization_type": "percentile",
653
+ "input_rotation_format": "quat",
654
+ "input_quat_order": "xyzw",
655
+ "reference_rotation_format": "quat",
656
+ "reference_quat_order": "xyzw",
657
+ "translation_scaling_key": "translation_scaling",
658
+ "rotation_scaling_key": "rotation_scaling",
659
+ "hold_through_clutch": false
660
+ },
661
+ {
662
+ "rep": "ABSOLUTE",
663
+ "type": "NON_EEF",
664
+ "format": "DEFAULT",
665
+ "state_key": "right_gripper",
666
+ "normalization_type": "percentile",
667
+ "input_rotation_format": "quat",
668
+ "input_quat_order": "xyzw",
669
+ "reference_rotation_format": "rot6d",
670
+ "reference_quat_order": "xyzw",
671
+ "translation_scaling_key": null,
672
+ "rotation_scaling_key": null,
673
+ "hold_through_clutch": true
674
+ },
675
+ {
676
+ "rep": "ABSOLUTE",
677
+ "type": "NON_EEF",
678
+ "format": "DEFAULT",
679
+ "state_key": null,
680
+ "normalization_type": "skip",
681
+ "input_rotation_format": "quat",
682
+ "input_quat_order": "xyzw",
683
+ "reference_rotation_format": "rot6d",
684
+ "reference_quat_order": "xyzw",
685
+ "translation_scaling_key": null,
686
+ "rotation_scaling_key": null,
687
+ "hold_through_clutch": false
688
+ },
689
+ {
690
+ "rep": "ABSOLUTE",
691
+ "type": "NON_EEF",
692
+ "format": "DEFAULT",
693
+ "state_key": null,
694
+ "normalization_type": "skip",
695
+ "input_rotation_format": "quat",
696
+ "input_quat_order": "xyzw",
697
+ "reference_rotation_format": "rot6d",
698
+ "reference_quat_order": "xyzw",
699
+ "translation_scaling_key": null,
700
+ "rotation_scaling_key": null,
701
+ "hold_through_clutch": false
702
+ }
703
+ ]
704
+ },
705
+ "language": {
706
+ "delta_indices": [
707
+ 0
708
+ ],
709
+ "modality_keys": [
710
+ "annotation.human.task_description"
711
+ ],
712
+ "sin_cos_embedding_keys": null,
713
+ "mean_std_embedding_keys": null,
714
+ "min_max_embedding_keys": null,
715
+ "pass_through_keys": null,
716
+ "action_configs": null
717
+ }
718
+ },
719
+ "ucsd_dvrk": {
720
+ "video": {
721
+ "delta_indices": [
722
+ 0
723
+ ],
724
+ "modality_keys": [
725
+ "camera_left"
726
+ ],
727
+ "sin_cos_embedding_keys": null,
728
+ "mean_std_embedding_keys": null,
729
+ "min_max_embedding_keys": null,
730
+ "pass_through_keys": null,
731
+ "action_configs": null
732
+ },
733
+ "state": {
734
+ "delta_indices": [
735
+ 0
736
+ ],
737
+ "modality_keys": [
738
+ "psm_retraction_pose",
739
+ "psm_retraction_gripper",
740
+ "psm_cutter_pose",
741
+ "psm_cutter_gripper"
742
+ ],
743
+ "sin_cos_embedding_keys": null,
744
+ "mean_std_embedding_keys": [
745
+ "psm_retraction_pose",
746
+ "psm_retraction_gripper",
747
+ "psm_cutter_pose",
748
+ "psm_cutter_gripper"
749
+ ],
750
+ "min_max_embedding_keys": null,
751
+ "pass_through_keys": null,
752
+ "action_configs": null
753
+ },
754
+ "action": {
755
+ "delta_indices": [
756
+ 1,
757
+ 2,
758
+ 3,
759
+ 4,
760
+ 5,
761
+ 6,
762
+ 7,
763
+ 8,
764
+ 9,
765
+ 10,
766
+ 11,
767
+ 12,
768
+ 13,
769
+ 14,
770
+ 15,
771
+ 16,
772
+ 17,
773
+ 18,
774
+ 19,
775
+ 20,
776
+ 21,
777
+ 22,
778
+ 23,
779
+ 24,
780
+ 25,
781
+ 26,
782
+ 27,
783
+ 28,
784
+ 29,
785
+ 30,
786
+ 31,
787
+ 32,
788
+ 33,
789
+ 34,
790
+ 35,
791
+ 36,
792
+ 37,
793
+ 38,
794
+ 39,
795
+ 40,
796
+ 41,
797
+ 42,
798
+ 43,
799
+ 44,
800
+ 45,
801
+ 46,
802
+ 47,
803
+ 48,
804
+ 49,
805
+ 50
806
+ ],
807
+ "modality_keys": [
808
+ "psm_retraction_pose",
809
+ "psm_retraction_gripper",
810
+ "psm_cutter_pose",
811
+ "psm_cutter_gripper"
812
+ ],
813
+ "sin_cos_embedding_keys": null,
814
+ "mean_std_embedding_keys": null,
815
+ "min_max_embedding_keys": null,
816
+ "pass_through_keys": null,
817
+ "action_configs": [
818
+ {
819
+ "rep": "REL_XYZ_ROT6D",
820
+ "type": "EEF",
821
+ "format": "XYZ_ROT6D",
822
+ "state_key": "psm_retraction_pose",
823
+ "normalization_type": "percentile",
824
+ "input_rotation_format": "quat",
825
+ "input_quat_order": "wxyz",
826
+ "reference_rotation_format": "quat",
827
+ "reference_quat_order": "wxyz",
828
+ "translation_scaling_key": null,
829
+ "rotation_scaling_key": null,
830
+ "hold_through_clutch": false
831
+ },
832
+ {
833
+ "rep": "ABSOLUTE",
834
+ "type": "NON_EEF",
835
+ "format": "DEFAULT",
836
+ "state_key": null,
837
+ "normalization_type": "percentile",
838
+ "input_rotation_format": "quat",
839
+ "input_quat_order": "xyzw",
840
+ "reference_rotation_format": "rot6d",
841
+ "reference_quat_order": "xyzw",
842
+ "translation_scaling_key": null,
843
+ "rotation_scaling_key": null,
844
+ "hold_through_clutch": false
845
+ },
846
+ {
847
+ "rep": "REL_XYZ_ROT6D",
848
+ "type": "EEF",
849
+ "format": "XYZ_ROT6D",
850
+ "state_key": "psm_cutter_pose",
851
+ "normalization_type": "percentile",
852
+ "input_rotation_format": "quat",
853
+ "input_quat_order": "wxyz",
854
+ "reference_rotation_format": "quat",
855
+ "reference_quat_order": "wxyz",
856
+ "translation_scaling_key": null,
857
+ "rotation_scaling_key": null,
858
+ "hold_through_clutch": false
859
+ },
860
+ {
861
+ "rep": "ABSOLUTE",
862
+ "type": "NON_EEF",
863
+ "format": "DEFAULT",
864
+ "state_key": null,
865
+ "normalization_type": "percentile",
866
+ "input_rotation_format": "quat",
867
+ "input_quat_order": "xyzw",
868
+ "reference_rotation_format": "rot6d",
869
+ "reference_quat_order": "xyzw",
870
+ "translation_scaling_key": null,
871
+ "rotation_scaling_key": null,
872
+ "hold_through_clutch": false
873
+ }
874
+ ]
875
+ },
876
+ "language": {
877
+ "delta_indices": [
878
+ 0
879
+ ],
880
+ "modality_keys": [
881
+ "task"
882
+ ],
883
+ "sin_cos_embedding_keys": null,
884
+ "mean_std_embedding_keys": null,
885
+ "min_max_embedding_keys": null,
886
+ "pass_through_keys": null,
887
+ "action_configs": null
888
+ }
889
+ },
890
+ "jhu_imerse_dvrk": {
891
+ "video": {
892
+ "delta_indices": [
893
+ 0
894
+ ],
895
+ "modality_keys": [
896
+ "endoscope_left",
897
+ "wrist_left",
898
+ "wrist_right"
899
+ ],
900
+ "sin_cos_embedding_keys": null,
901
+ "mean_std_embedding_keys": null,
902
+ "min_max_embedding_keys": null,
903
+ "pass_through_keys": null,
904
+ "action_configs": null
905
+ },
906
+ "state": {
907
+ "delta_indices": [
908
+ 0
909
+ ],
910
+ "modality_keys": [
911
+ "psm1_pose",
912
+ "psm1_gripper",
913
+ "psm2_pose",
914
+ "psm2_gripper"
915
+ ],
916
+ "sin_cos_embedding_keys": null,
917
+ "mean_std_embedding_keys": [
918
+ "psm1_pose",
919
+ "psm1_gripper",
920
+ "psm2_pose",
921
+ "psm2_gripper"
922
+ ],
923
+ "min_max_embedding_keys": null,
924
+ "pass_through_keys": null,
925
+ "action_configs": null
926
+ },
927
+ "action": {
928
+ "delta_indices": [
929
+ 1,
930
+ 2,
931
+ 3,
932
+ 4,
933
+ 5,
934
+ 6,
935
+ 7,
936
+ 8,
937
+ 9,
938
+ 10,
939
+ 11,
940
+ 12,
941
+ 13,
942
+ 14,
943
+ 15,
944
+ 16,
945
+ 17,
946
+ 18,
947
+ 19,
948
+ 20,
949
+ 21,
950
+ 22,
951
+ 23,
952
+ 24,
953
+ 25,
954
+ 26,
955
+ 27,
956
+ 28,
957
+ 29,
958
+ 30,
959
+ 31,
960
+ 32,
961
+ 33,
962
+ 34,
963
+ 35,
964
+ 36,
965
+ 37,
966
+ 38,
967
+ 39,
968
+ 40,
969
+ 41,
970
+ 42,
971
+ 43,
972
+ 44,
973
+ 45,
974
+ 46,
975
+ 47,
976
+ 48,
977
+ 49,
978
+ 50
979
+ ],
980
+ "modality_keys": [
981
+ "psm1_pose",
982
+ "psm1_gripper",
983
+ "psm2_pose",
984
+ "psm2_gripper"
985
+ ],
986
+ "sin_cos_embedding_keys": null,
987
+ "mean_std_embedding_keys": null,
988
+ "min_max_embedding_keys": null,
989
+ "pass_through_keys": null,
990
+ "action_configs": [
991
+ {
992
+ "rep": "REL_XYZ_ROT6D",
993
+ "type": "EEF",
994
+ "format": "XYZ_ROT6D",
995
+ "state_key": "psm1_pose",
996
+ "normalization_type": "percentile",
997
+ "input_rotation_format": "quat",
998
+ "input_quat_order": "xyzw",
999
+ "reference_rotation_format": "quat",
1000
+ "reference_quat_order": "xyzw",
1001
+ "translation_scaling_key": null,
1002
+ "rotation_scaling_key": null,
1003
+ "hold_through_clutch": false
1004
+ },
1005
+ {
1006
+ "rep": "ABSOLUTE",
1007
+ "type": "NON_EEF",
1008
+ "format": "DEFAULT",
1009
+ "state_key": null,
1010
+ "normalization_type": "percentile",
1011
+ "input_rotation_format": "quat",
1012
+ "input_quat_order": "xyzw",
1013
+ "reference_rotation_format": "rot6d",
1014
+ "reference_quat_order": "xyzw",
1015
+ "translation_scaling_key": null,
1016
+ "rotation_scaling_key": null,
1017
+ "hold_through_clutch": false
1018
+ },
1019
+ {
1020
+ "rep": "REL_XYZ_ROT6D",
1021
+ "type": "EEF",
1022
+ "format": "XYZ_ROT6D",
1023
+ "state_key": "psm2_pose",
1024
+ "normalization_type": "percentile",
1025
+ "input_rotation_format": "quat",
1026
+ "input_quat_order": "xyzw",
1027
+ "reference_rotation_format": "quat",
1028
+ "reference_quat_order": "xyzw",
1029
+ "translation_scaling_key": null,
1030
+ "rotation_scaling_key": null,
1031
+ "hold_through_clutch": false
1032
+ },
1033
+ {
1034
+ "rep": "ABSOLUTE",
1035
+ "type": "NON_EEF",
1036
+ "format": "DEFAULT",
1037
+ "state_key": null,
1038
+ "normalization_type": "percentile",
1039
+ "input_rotation_format": "quat",
1040
+ "input_quat_order": "xyzw",
1041
+ "reference_rotation_format": "rot6d",
1042
+ "reference_quat_order": "xyzw",
1043
+ "translation_scaling_key": null,
1044
+ "rotation_scaling_key": null,
1045
+ "hold_through_clutch": false
1046
+ }
1047
+ ]
1048
+ },
1049
+ "language": {
1050
+ "delta_indices": [
1051
+ 0
1052
+ ],
1053
+ "modality_keys": [
1054
+ "annotation.human.task_description"
1055
+ ],
1056
+ "sin_cos_embedding_keys": null,
1057
+ "mean_std_embedding_keys": null,
1058
+ "min_max_embedding_keys": null,
1059
+ "pass_through_keys": null,
1060
+ "action_configs": null
1061
+ }
1062
+ },
1063
+ "obuda_dvrk": {
1064
+ "video": {
1065
+ "delta_indices": [
1066
+ 0
1067
+ ],
1068
+ "modality_keys": [
1069
+ "endoscope_left",
1070
+ "wrist_left",
1071
+ "wrist_right"
1072
+ ],
1073
+ "sin_cos_embedding_keys": null,
1074
+ "mean_std_embedding_keys": null,
1075
+ "min_max_embedding_keys": null,
1076
+ "pass_through_keys": null,
1077
+ "action_configs": null
1078
+ },
1079
+ "state": {
1080
+ "delta_indices": [
1081
+ 0
1082
+ ],
1083
+ "modality_keys": [
1084
+ "psm1_pose",
1085
+ "psm1_gripper",
1086
+ "psm2_pose",
1087
+ "psm2_gripper"
1088
+ ],
1089
+ "sin_cos_embedding_keys": null,
1090
+ "mean_std_embedding_keys": [
1091
+ "psm1_pose",
1092
+ "psm1_gripper",
1093
+ "psm2_pose",
1094
+ "psm2_gripper"
1095
+ ],
1096
+ "min_max_embedding_keys": null,
1097
+ "pass_through_keys": null,
1098
+ "action_configs": null
1099
+ },
1100
+ "action": {
1101
+ "delta_indices": [
1102
+ 0,
1103
+ 1,
1104
+ 2,
1105
+ 3,
1106
+ 4,
1107
+ 5,
1108
+ 6,
1109
+ 7,
1110
+ 8,
1111
+ 9,
1112
+ 10,
1113
+ 11,
1114
+ 12,
1115
+ 13,
1116
+ 14,
1117
+ 15,
1118
+ 16,
1119
+ 17,
1120
+ 18,
1121
+ 19,
1122
+ 20,
1123
+ 21,
1124
+ 22,
1125
+ 23,
1126
+ 24,
1127
+ 25,
1128
+ 26,
1129
+ 27,
1130
+ 28,
1131
+ 29,
1132
+ 30,
1133
+ 31,
1134
+ 32,
1135
+ 33,
1136
+ 34,
1137
+ 35,
1138
+ 36,
1139
+ 37,
1140
+ 38,
1141
+ 39,
1142
+ 40,
1143
+ 41,
1144
+ 42,
1145
+ 43,
1146
+ 44,
1147
+ 45,
1148
+ 46,
1149
+ 47,
1150
+ 48,
1151
+ 49
1152
+ ],
1153
+ "modality_keys": [
1154
+ "psm1_pose",
1155
+ "psm1_gripper",
1156
+ "psm2_pose",
1157
+ "psm2_gripper"
1158
+ ],
1159
+ "sin_cos_embedding_keys": null,
1160
+ "mean_std_embedding_keys": null,
1161
+ "min_max_embedding_keys": null,
1162
+ "pass_through_keys": null,
1163
+ "action_configs": [
1164
+ {
1165
+ "rep": "REL_XYZ_ROT6D",
1166
+ "type": "EEF",
1167
+ "format": "XYZ_ROT6D",
1168
+ "state_key": "psm1_pose",
1169
+ "normalization_type": "percentile",
1170
+ "input_rotation_format": "quat",
1171
+ "input_quat_order": "xyzw",
1172
+ "reference_rotation_format": "quat",
1173
+ "reference_quat_order": "xyzw",
1174
+ "translation_scaling_key": null,
1175
+ "rotation_scaling_key": null,
1176
+ "hold_through_clutch": false
1177
+ },
1178
+ {
1179
+ "rep": "ABSOLUTE",
1180
+ "type": "NON_EEF",
1181
+ "format": "DEFAULT",
1182
+ "state_key": null,
1183
+ "normalization_type": "percentile",
1184
+ "input_rotation_format": "quat",
1185
+ "input_quat_order": "xyzw",
1186
+ "reference_rotation_format": "rot6d",
1187
+ "reference_quat_order": "xyzw",
1188
+ "translation_scaling_key": null,
1189
+ "rotation_scaling_key": null,
1190
+ "hold_through_clutch": false
1191
+ },
1192
+ {
1193
+ "rep": "REL_XYZ_ROT6D",
1194
+ "type": "EEF",
1195
+ "format": "XYZ_ROT6D",
1196
+ "state_key": "psm2_pose",
1197
+ "normalization_type": "percentile",
1198
+ "input_rotation_format": "quat",
1199
+ "input_quat_order": "xyzw",
1200
+ "reference_rotation_format": "quat",
1201
+ "reference_quat_order": "xyzw",
1202
+ "translation_scaling_key": null,
1203
+ "rotation_scaling_key": null,
1204
+ "hold_through_clutch": false
1205
+ },
1206
+ {
1207
+ "rep": "ABSOLUTE",
1208
+ "type": "NON_EEF",
1209
+ "format": "DEFAULT",
1210
+ "state_key": null,
1211
+ "normalization_type": "percentile",
1212
+ "input_rotation_format": "quat",
1213
+ "input_quat_order": "xyzw",
1214
+ "reference_rotation_format": "rot6d",
1215
+ "reference_quat_order": "xyzw",
1216
+ "translation_scaling_key": null,
1217
+ "rotation_scaling_key": null,
1218
+ "hold_through_clutch": false
1219
+ }
1220
+ ]
1221
+ },
1222
+ "language": {
1223
+ "delta_indices": [
1224
+ 0
1225
+ ],
1226
+ "modality_keys": [
1227
+ "task"
1228
+ ],
1229
+ "sin_cos_embedding_keys": null,
1230
+ "mean_std_embedding_keys": null,
1231
+ "min_max_embedding_keys": null,
1232
+ "pass_through_keys": null,
1233
+ "action_configs": null
1234
+ }
1235
+ },
1236
+ "stanford_dvrk_real": {
1237
+ "video": {
1238
+ "delta_indices": [
1239
+ 0
1240
+ ],
1241
+ "modality_keys": [
1242
+ "endoscope_left"
1243
+ ],
1244
+ "sin_cos_embedding_keys": null,
1245
+ "mean_std_embedding_keys": null,
1246
+ "min_max_embedding_keys": null,
1247
+ "pass_through_keys": null,
1248
+ "action_configs": null
1249
+ },
1250
+ "state": {
1251
+ "delta_indices": [
1252
+ 0
1253
+ ],
1254
+ "modality_keys": [
1255
+ "psm1_pose",
1256
+ "psm1_gripper",
1257
+ "psm2_pose",
1258
+ "psm2_gripper"
1259
+ ],
1260
+ "sin_cos_embedding_keys": null,
1261
+ "mean_std_embedding_keys": [
1262
+ "psm1_pose",
1263
+ "psm1_gripper",
1264
+ "psm2_pose",
1265
+ "psm2_gripper"
1266
+ ],
1267
+ "min_max_embedding_keys": null,
1268
+ "pass_through_keys": null,
1269
+ "action_configs": null
1270
+ },
1271
+ "action": {
1272
+ "delta_indices": [
1273
+ 0,
1274
+ 1,
1275
+ 2,
1276
+ 3,
1277
+ 4,
1278
+ 5,
1279
+ 6,
1280
+ 7,
1281
+ 8,
1282
+ 9,
1283
+ 10,
1284
+ 11,
1285
+ 12,
1286
+ 13,
1287
+ 14,
1288
+ 15,
1289
+ 16,
1290
+ 17,
1291
+ 18,
1292
+ 19,
1293
+ 20,
1294
+ 21,
1295
+ 22,
1296
+ 23,
1297
+ 24,
1298
+ 25,
1299
+ 26,
1300
+ 27,
1301
+ 28,
1302
+ 29,
1303
+ 30,
1304
+ 31,
1305
+ 32,
1306
+ 33,
1307
+ 34,
1308
+ 35,
1309
+ 36,
1310
+ 37,
1311
+ 38,
1312
+ 39,
1313
+ 40,
1314
+ 41,
1315
+ 42,
1316
+ 43,
1317
+ 44,
1318
+ 45,
1319
+ 46,
1320
+ 47,
1321
+ 48,
1322
+ 49
1323
+ ],
1324
+ "modality_keys": [
1325
+ "psm1_pose",
1326
+ "psm1_gripper",
1327
+ "psm2_pose",
1328
+ "psm2_gripper"
1329
+ ],
1330
+ "sin_cos_embedding_keys": null,
1331
+ "mean_std_embedding_keys": null,
1332
+ "min_max_embedding_keys": null,
1333
+ "pass_through_keys": null,
1334
+ "action_configs": [
1335
+ {
1336
+ "rep": "REL_XYZ_ROT6D",
1337
+ "type": "EEF",
1338
+ "format": "XYZ_ROT6D",
1339
+ "state_key": "psm1_pose",
1340
+ "normalization_type": "percentile",
1341
+ "input_rotation_format": "euler",
1342
+ "input_quat_order": "xyzw",
1343
+ "reference_rotation_format": "euler",
1344
+ "reference_quat_order": "xyzw",
1345
+ "translation_scaling_key": null,
1346
+ "rotation_scaling_key": null,
1347
+ "hold_through_clutch": false
1348
+ },
1349
+ {
1350
+ "rep": "ABSOLUTE",
1351
+ "type": "NON_EEF",
1352
+ "format": "DEFAULT",
1353
+ "state_key": null,
1354
+ "normalization_type": "percentile",
1355
+ "input_rotation_format": "quat",
1356
+ "input_quat_order": "xyzw",
1357
+ "reference_rotation_format": "rot6d",
1358
+ "reference_quat_order": "xyzw",
1359
+ "translation_scaling_key": null,
1360
+ "rotation_scaling_key": null,
1361
+ "hold_through_clutch": false
1362
+ },
1363
+ {
1364
+ "rep": "REL_XYZ_ROT6D",
1365
+ "type": "EEF",
1366
+ "format": "XYZ_ROT6D",
1367
+ "state_key": "psm2_pose",
1368
+ "normalization_type": "percentile",
1369
+ "input_rotation_format": "euler",
1370
+ "input_quat_order": "xyzw",
1371
+ "reference_rotation_format": "euler",
1372
+ "reference_quat_order": "xyzw",
1373
+ "translation_scaling_key": null,
1374
+ "rotation_scaling_key": null,
1375
+ "hold_through_clutch": false
1376
+ },
1377
+ {
1378
+ "rep": "ABSOLUTE",
1379
+ "type": "NON_EEF",
1380
+ "format": "DEFAULT",
1381
+ "state_key": null,
1382
+ "normalization_type": "percentile",
1383
+ "input_rotation_format": "quat",
1384
+ "input_quat_order": "xyzw",
1385
+ "reference_rotation_format": "rot6d",
1386
+ "reference_quat_order": "xyzw",
1387
+ "translation_scaling_key": null,
1388
+ "rotation_scaling_key": null,
1389
+ "hold_through_clutch": false
1390
+ }
1391
+ ]
1392
+ },
1393
+ "language": {
1394
+ "delta_indices": [
1395
+ 0
1396
+ ],
1397
+ "modality_keys": [
1398
+ "task"
1399
+ ],
1400
+ "sin_cos_embedding_keys": null,
1401
+ "mean_std_embedding_keys": null,
1402
+ "min_max_embedding_keys": null,
1403
+ "pass_through_keys": null,
1404
+ "action_configs": null
1405
+ }
1406
+ },
1407
+ "tud_tundra_ur5e": {
1408
+ "video": {
1409
+ "delta_indices": [
1410
+ 0
1411
+ ],
1412
+ "modality_keys": [
1413
+ "laparoscope_left"
1414
+ ],
1415
+ "sin_cos_embedding_keys": null,
1416
+ "mean_std_embedding_keys": null,
1417
+ "min_max_embedding_keys": null,
1418
+ "pass_through_keys": null,
1419
+ "action_configs": null
1420
+ },
1421
+ "state": {
1422
+ "delta_indices": [
1423
+ 0
1424
+ ],
1425
+ "modality_keys": [
1426
+ "joint_position",
1427
+ "eef_pose"
1428
+ ],
1429
+ "sin_cos_embedding_keys": null,
1430
+ "mean_std_embedding_keys": [
1431
+ "joint_position"
1432
+ ],
1433
+ "min_max_embedding_keys": null,
1434
+ "pass_through_keys": [
1435
+ "eef_pose"
1436
+ ],
1437
+ "action_configs": null
1438
+ },
1439
+ "action": {
1440
+ "delta_indices": [
1441
+ 1,
1442
+ 2,
1443
+ 3,
1444
+ 4,
1445
+ 5,
1446
+ 6,
1447
+ 7,
1448
+ 8,
1449
+ 9,
1450
+ 10,
1451
+ 11,
1452
+ 12,
1453
+ 13,
1454
+ 14,
1455
+ 15,
1456
+ 16,
1457
+ 17,
1458
+ 18,
1459
+ 19,
1460
+ 20,
1461
+ 21,
1462
+ 22,
1463
+ 23,
1464
+ 24,
1465
+ 25,
1466
+ 26,
1467
+ 27,
1468
+ 28,
1469
+ 29,
1470
+ 30,
1471
+ 31,
1472
+ 32,
1473
+ 33,
1474
+ 34,
1475
+ 35,
1476
+ 36,
1477
+ 37,
1478
+ 38,
1479
+ 39,
1480
+ 40,
1481
+ 41,
1482
+ 42,
1483
+ 43,
1484
+ 44,
1485
+ 45,
1486
+ 46,
1487
+ 47,
1488
+ 48,
1489
+ 49,
1490
+ 50
1491
+ ],
1492
+ "modality_keys": [
1493
+ "eef_pose",
1494
+ "gripper"
1495
+ ],
1496
+ "sin_cos_embedding_keys": null,
1497
+ "mean_std_embedding_keys": null,
1498
+ "min_max_embedding_keys": null,
1499
+ "pass_through_keys": null,
1500
+ "action_configs": [
1501
+ {
1502
+ "rep": "REL_XYZ_ROT6D",
1503
+ "type": "EEF",
1504
+ "format": "XYZ_ROT6D",
1505
+ "state_key": "eef_pose",
1506
+ "normalization_type": "percentile",
1507
+ "input_rotation_format": "quat",
1508
+ "input_quat_order": "xyzw",
1509
+ "reference_rotation_format": "quat",
1510
+ "reference_quat_order": "xyzw",
1511
+ "translation_scaling_key": null,
1512
+ "rotation_scaling_key": null,
1513
+ "hold_through_clutch": false
1514
+ },
1515
+ {
1516
+ "rep": "ABSOLUTE",
1517
+ "type": "NON_EEF",
1518
+ "format": "DEFAULT",
1519
+ "state_key": null,
1520
+ "normalization_type": "percentile",
1521
+ "input_rotation_format": "quat",
1522
+ "input_quat_order": "xyzw",
1523
+ "reference_rotation_format": "rot6d",
1524
+ "reference_quat_order": "xyzw",
1525
+ "translation_scaling_key": null,
1526
+ "rotation_scaling_key": null,
1527
+ "hold_through_clutch": false
1528
+ }
1529
+ ]
1530
+ },
1531
+ "language": {
1532
+ "delta_indices": [
1533
+ 0
1534
+ ],
1535
+ "modality_keys": [
1536
+ "task"
1537
+ ],
1538
+ "sin_cos_embedding_keys": null,
1539
+ "mean_std_embedding_keys": null,
1540
+ "min_max_embedding_keys": null,
1541
+ "pass_through_keys": null,
1542
+ "action_configs": null
1543
+ }
1544
+ },
1545
+ "jhu_lscr_dvrk_smarts": {
1546
+ "video": {
1547
+ "delta_indices": [
1548
+ 0
1549
+ ],
1550
+ "modality_keys": [
1551
+ "endoscope_left",
1552
+ "camera_side_view"
1553
+ ],
1554
+ "sin_cos_embedding_keys": null,
1555
+ "mean_std_embedding_keys": null,
1556
+ "min_max_embedding_keys": null,
1557
+ "pass_through_keys": null,
1558
+ "action_configs": null
1559
+ },
1560
+ "state": {
1561
+ "delta_indices": [
1562
+ 0
1563
+ ],
1564
+ "modality_keys": [
1565
+ "psm1_pose",
1566
+ "psm1_gripper",
1567
+ "psm2_pose",
1568
+ "psm2_gripper"
1569
+ ],
1570
+ "sin_cos_embedding_keys": null,
1571
+ "mean_std_embedding_keys": [
1572
+ "psm1_pose",
1573
+ "psm1_gripper",
1574
+ "psm2_pose",
1575
+ "psm2_gripper"
1576
+ ],
1577
+ "min_max_embedding_keys": null,
1578
+ "pass_through_keys": null,
1579
+ "action_configs": null
1580
+ },
1581
+ "action": {
1582
+ "delta_indices": [
1583
+ 1,
1584
+ 2,
1585
+ 3,
1586
+ 4,
1587
+ 5,
1588
+ 6,
1589
+ 7,
1590
+ 8,
1591
+ 9,
1592
+ 10,
1593
+ 11,
1594
+ 12,
1595
+ 13,
1596
+ 14,
1597
+ 15,
1598
+ 16
1599
+ ],
1600
+ "modality_keys": [
1601
+ "psm1_pose",
1602
+ "psm1_gripper",
1603
+ "psm2_pose",
1604
+ "psm2_gripper"
1605
+ ],
1606
+ "sin_cos_embedding_keys": null,
1607
+ "mean_std_embedding_keys": null,
1608
+ "min_max_embedding_keys": null,
1609
+ "pass_through_keys": null,
1610
+ "action_configs": [
1611
+ {
1612
+ "rep": "REL_XYZ_ROT6D",
1613
+ "type": "EEF",
1614
+ "format": "XYZ_ROT6D",
1615
+ "state_key": "psm1_pose",
1616
+ "normalization_type": "percentile",
1617
+ "input_rotation_format": "quat",
1618
+ "input_quat_order": "xyzw",
1619
+ "reference_rotation_format": "quat",
1620
+ "reference_quat_order": "xyzw",
1621
+ "translation_scaling_key": null,
1622
+ "rotation_scaling_key": null,
1623
+ "hold_through_clutch": false
1624
+ },
1625
+ {
1626
+ "rep": "ABSOLUTE",
1627
+ "type": "NON_EEF",
1628
+ "format": "DEFAULT",
1629
+ "state_key": null,
1630
+ "normalization_type": "percentile",
1631
+ "input_rotation_format": "quat",
1632
+ "input_quat_order": "xyzw",
1633
+ "reference_rotation_format": "rot6d",
1634
+ "reference_quat_order": "xyzw",
1635
+ "translation_scaling_key": null,
1636
+ "rotation_scaling_key": null,
1637
+ "hold_through_clutch": false
1638
+ },
1639
+ {
1640
+ "rep": "REL_XYZ_ROT6D",
1641
+ "type": "EEF",
1642
+ "format": "XYZ_ROT6D",
1643
+ "state_key": "psm2_pose",
1644
+ "normalization_type": "percentile",
1645
+ "input_rotation_format": "quat",
1646
+ "input_quat_order": "xyzw",
1647
+ "reference_rotation_format": "quat",
1648
+ "reference_quat_order": "xyzw",
1649
+ "translation_scaling_key": null,
1650
+ "rotation_scaling_key": null,
1651
+ "hold_through_clutch": false
1652
+ },
1653
+ {
1654
+ "rep": "ABSOLUTE",
1655
+ "type": "NON_EEF",
1656
+ "format": "DEFAULT",
1657
+ "state_key": null,
1658
+ "normalization_type": "percentile",
1659
+ "input_rotation_format": "quat",
1660
+ "input_quat_order": "xyzw",
1661
+ "reference_rotation_format": "rot6d",
1662
+ "reference_quat_order": "xyzw",
1663
+ "translation_scaling_key": null,
1664
+ "rotation_scaling_key": null,
1665
+ "hold_through_clutch": false
1666
+ }
1667
+ ]
1668
+ },
1669
+ "language": {
1670
+ "delta_indices": [
1671
+ 0
1672
+ ],
1673
+ "modality_keys": [
1674
+ "annotation.task"
1675
+ ],
1676
+ "sin_cos_embedding_keys": null,
1677
+ "mean_std_embedding_keys": null,
1678
+ "min_max_embedding_keys": null,
1679
+ "pass_through_keys": null,
1680
+ "action_configs": null
1681
+ }
1682
+ },
1683
+ "jhu_imerse_dvrk_mono": {
1684
+ "video": {
1685
+ "delta_indices": [
1686
+ 0
1687
+ ],
1688
+ "modality_keys": [
1689
+ "endoscope_left"
1690
+ ],
1691
+ "sin_cos_embedding_keys": null,
1692
+ "mean_std_embedding_keys": null,
1693
+ "min_max_embedding_keys": null,
1694
+ "pass_through_keys": null,
1695
+ "action_configs": null
1696
+ },
1697
+ "state": {
1698
+ "delta_indices": [
1699
+ 0
1700
+ ],
1701
+ "modality_keys": [
1702
+ "psm1_pose",
1703
+ "psm1_gripper",
1704
+ "psm2_pose",
1705
+ "psm2_gripper"
1706
+ ],
1707
+ "sin_cos_embedding_keys": null,
1708
+ "mean_std_embedding_keys": [
1709
+ "psm1_pose",
1710
+ "psm1_gripper",
1711
+ "psm2_pose",
1712
+ "psm2_gripper"
1713
+ ],
1714
+ "min_max_embedding_keys": null,
1715
+ "pass_through_keys": null,
1716
+ "action_configs": null
1717
+ },
1718
+ "action": {
1719
+ "delta_indices": [
1720
+ 0,
1721
+ 1,
1722
+ 2,
1723
+ 3,
1724
+ 4,
1725
+ 5,
1726
+ 6,
1727
+ 7,
1728
+ 8,
1729
+ 9,
1730
+ 10,
1731
+ 11,
1732
+ 12,
1733
+ 13,
1734
+ 14,
1735
+ 15,
1736
+ 16,
1737
+ 17,
1738
+ 18,
1739
+ 19,
1740
+ 20,
1741
+ 21,
1742
+ 22,
1743
+ 23,
1744
+ 24,
1745
+ 25,
1746
+ 26,
1747
+ 27,
1748
+ 28,
1749
+ 29,
1750
+ 30,
1751
+ 31,
1752
+ 32,
1753
+ 33,
1754
+ 34,
1755
+ 35,
1756
+ 36,
1757
+ 37,
1758
+ 38,
1759
+ 39,
1760
+ 40,
1761
+ 41,
1762
+ 42,
1763
+ 43,
1764
+ 44,
1765
+ 45,
1766
+ 46,
1767
+ 47,
1768
+ 48,
1769
+ 49
1770
+ ],
1771
+ "modality_keys": [
1772
+ "psm1_pose",
1773
+ "psm1_gripper",
1774
+ "psm2_pose",
1775
+ "psm2_gripper"
1776
+ ],
1777
+ "sin_cos_embedding_keys": null,
1778
+ "mean_std_embedding_keys": null,
1779
+ "min_max_embedding_keys": null,
1780
+ "pass_through_keys": null,
1781
+ "action_configs": [
1782
+ {
1783
+ "rep": "REL_XYZ_ROT6D",
1784
+ "type": "EEF",
1785
+ "format": "XYZ_ROT6D",
1786
+ "state_key": "psm1_pose",
1787
+ "normalization_type": "percentile",
1788
+ "input_rotation_format": "quat",
1789
+ "input_quat_order": "xyzw",
1790
+ "reference_rotation_format": "quat",
1791
+ "reference_quat_order": "xyzw",
1792
+ "translation_scaling_key": null,
1793
+ "rotation_scaling_key": null,
1794
+ "hold_through_clutch": false
1795
+ },
1796
+ {
1797
+ "rep": "ABSOLUTE",
1798
+ "type": "NON_EEF",
1799
+ "format": "DEFAULT",
1800
+ "state_key": null,
1801
+ "normalization_type": "percentile",
1802
+ "input_rotation_format": "quat",
1803
+ "input_quat_order": "xyzw",
1804
+ "reference_rotation_format": "rot6d",
1805
+ "reference_quat_order": "xyzw",
1806
+ "translation_scaling_key": null,
1807
+ "rotation_scaling_key": null,
1808
+ "hold_through_clutch": false
1809
+ },
1810
+ {
1811
+ "rep": "REL_XYZ_ROT6D",
1812
+ "type": "EEF",
1813
+ "format": "XYZ_ROT6D",
1814
+ "state_key": "psm2_pose",
1815
+ "normalization_type": "percentile",
1816
+ "input_rotation_format": "quat",
1817
+ "input_quat_order": "xyzw",
1818
+ "reference_rotation_format": "quat",
1819
+ "reference_quat_order": "xyzw",
1820
+ "translation_scaling_key": null,
1821
+ "rotation_scaling_key": null,
1822
+ "hold_through_clutch": false
1823
+ },
1824
+ {
1825
+ "rep": "ABSOLUTE",
1826
+ "type": "NON_EEF",
1827
+ "format": "DEFAULT",
1828
+ "state_key": null,
1829
+ "normalization_type": "percentile",
1830
+ "input_rotation_format": "quat",
1831
+ "input_quat_order": "xyzw",
1832
+ "reference_rotation_format": "rot6d",
1833
+ "reference_quat_order": "xyzw",
1834
+ "translation_scaling_key": null,
1835
+ "rotation_scaling_key": null,
1836
+ "hold_through_clutch": false
1837
+ }
1838
+ ]
1839
+ },
1840
+ "language": {
1841
+ "delta_indices": [
1842
+ 0
1843
+ ],
1844
+ "modality_keys": [
1845
+ "annotation.human.task_description"
1846
+ ],
1847
+ "sin_cos_embedding_keys": null,
1848
+ "mean_std_embedding_keys": null,
1849
+ "min_max_embedding_keys": null,
1850
+ "pass_through_keys": null,
1851
+ "action_configs": null
1852
+ }
1853
+ },
1854
+ "rob_surgical_bitrack": {
1855
+ "video": {
1856
+ "delta_indices": [
1857
+ 0
1858
+ ],
1859
+ "modality_keys": [
1860
+ "endoscope"
1861
+ ],
1862
+ "sin_cos_embedding_keys": null,
1863
+ "mean_std_embedding_keys": null,
1864
+ "min_max_embedding_keys": null,
1865
+ "pass_through_keys": null,
1866
+ "action_configs": null
1867
+ },
1868
+ "state": {
1869
+ "delta_indices": [
1870
+ 0
1871
+ ],
1872
+ "modality_keys": [
1873
+ "left_pose",
1874
+ "right_pose",
1875
+ "aux_pose"
1876
+ ],
1877
+ "sin_cos_embedding_keys": null,
1878
+ "mean_std_embedding_keys": [
1879
+ "left_pose",
1880
+ "right_pose",
1881
+ "aux_pose"
1882
+ ],
1883
+ "min_max_embedding_keys": null,
1884
+ "pass_through_keys": null,
1885
+ "action_configs": null
1886
+ },
1887
+ "action": {
1888
+ "delta_indices": [
1889
+ 0,
1890
+ 1,
1891
+ 2,
1892
+ 3,
1893
+ 4,
1894
+ 5,
1895
+ 6,
1896
+ 7,
1897
+ 8,
1898
+ 9,
1899
+ 10,
1900
+ 11,
1901
+ 12,
1902
+ 13,
1903
+ 14,
1904
+ 15,
1905
+ 16,
1906
+ 17,
1907
+ 18,
1908
+ 19,
1909
+ 20,
1910
+ 21,
1911
+ 22,
1912
+ 23,
1913
+ 24,
1914
+ 25,
1915
+ 26,
1916
+ 27,
1917
+ 28,
1918
+ 29,
1919
+ 30,
1920
+ 31,
1921
+ 32,
1922
+ 33,
1923
+ 34,
1924
+ 35,
1925
+ 36,
1926
+ 37,
1927
+ 38,
1928
+ 39,
1929
+ 40,
1930
+ 41,
1931
+ 42,
1932
+ 43,
1933
+ 44,
1934
+ 45,
1935
+ 46,
1936
+ 47,
1937
+ 48,
1938
+ 49
1939
+ ],
1940
+ "modality_keys": [
1941
+ "left_pose",
1942
+ "right_pose",
1943
+ "aux_pose"
1944
+ ],
1945
+ "sin_cos_embedding_keys": null,
1946
+ "mean_std_embedding_keys": null,
1947
+ "min_max_embedding_keys": null,
1948
+ "pass_through_keys": null,
1949
+ "action_configs": [
1950
+ {
1951
+ "rep": "REL_XYZ_ROT6D",
1952
+ "type": "EEF",
1953
+ "format": "XYZ_ROT6D",
1954
+ "state_key": "left_pose",
1955
+ "normalization_type": "percentile",
1956
+ "input_rotation_format": "euler",
1957
+ "input_quat_order": "xyzw",
1958
+ "reference_rotation_format": "euler",
1959
+ "reference_quat_order": "xyzw",
1960
+ "translation_scaling_key": null,
1961
+ "rotation_scaling_key": null,
1962
+ "hold_through_clutch": false
1963
+ },
1964
+ {
1965
+ "rep": "REL_XYZ_ROT6D",
1966
+ "type": "EEF",
1967
+ "format": "XYZ_ROT6D",
1968
+ "state_key": "right_pose",
1969
+ "normalization_type": "percentile",
1970
+ "input_rotation_format": "euler",
1971
+ "input_quat_order": "xyzw",
1972
+ "reference_rotation_format": "euler",
1973
+ "reference_quat_order": "xyzw",
1974
+ "translation_scaling_key": null,
1975
+ "rotation_scaling_key": null,
1976
+ "hold_through_clutch": false
1977
+ },
1978
+ {
1979
+ "rep": "REL_XYZ_ROT6D",
1980
+ "type": "EEF",
1981
+ "format": "XYZ_ROT6D",
1982
+ "state_key": "aux_pose",
1983
+ "normalization_type": "percentile",
1984
+ "input_rotation_format": "euler",
1985
+ "input_quat_order": "xyzw",
1986
+ "reference_rotation_format": "euler",
1987
+ "reference_quat_order": "xyzw",
1988
+ "translation_scaling_key": null,
1989
+ "rotation_scaling_key": null,
1990
+ "hold_through_clutch": false
1991
+ }
1992
+ ]
1993
+ },
1994
+ "language": {
1995
+ "delta_indices": [
1996
+ 0
1997
+ ],
1998
+ "modality_keys": [
1999
+ "annotation.instruction"
2000
+ ],
2001
+ "sin_cos_embedding_keys": null,
2002
+ "mean_std_embedding_keys": null,
2003
+ "min_max_embedding_keys": null,
2004
+ "pass_through_keys": null,
2005
+ "action_configs": null
2006
+ }
2007
+ },
2008
+ "turin_mitic_ex_vivo": {
2009
+ "video": {
2010
+ "delta_indices": [
2011
+ 0
2012
+ ],
2013
+ "modality_keys": [
2014
+ "endoscope_left"
2015
+ ],
2016
+ "sin_cos_embedding_keys": null,
2017
+ "mean_std_embedding_keys": null,
2018
+ "min_max_embedding_keys": null,
2019
+ "pass_through_keys": null,
2020
+ "action_configs": null
2021
+ },
2022
+ "state": {
2023
+ "delta_indices": [
2024
+ 0
2025
+ ],
2026
+ "modality_keys": [
2027
+ "psm1_joints",
2028
+ "psm2_joints",
2029
+ "psm1_pose",
2030
+ "psm2_pose"
2031
+ ],
2032
+ "sin_cos_embedding_keys": null,
2033
+ "mean_std_embedding_keys": [
2034
+ "psm1_joints",
2035
+ "psm2_joints"
2036
+ ],
2037
+ "min_max_embedding_keys": null,
2038
+ "pass_through_keys": [
2039
+ "psm1_pose",
2040
+ "psm2_pose"
2041
+ ],
2042
+ "action_configs": null
2043
+ },
2044
+ "action": {
2045
+ "delta_indices": [
2046
+ 1,
2047
+ 2,
2048
+ 3,
2049
+ 4,
2050
+ 5,
2051
+ 6,
2052
+ 7,
2053
+ 8,
2054
+ 9,
2055
+ 10,
2056
+ 11,
2057
+ 12,
2058
+ 13,
2059
+ 14,
2060
+ 15,
2061
+ 16,
2062
+ 17,
2063
+ 18,
2064
+ 19,
2065
+ 20,
2066
+ 21,
2067
+ 22,
2068
+ 23,
2069
+ 24,
2070
+ 25,
2071
+ 26,
2072
+ 27,
2073
+ 28,
2074
+ 29,
2075
+ 30,
2076
+ 31,
2077
+ 32,
2078
+ 33,
2079
+ 34,
2080
+ 35,
2081
+ 36,
2082
+ 37,
2083
+ 38,
2084
+ 39,
2085
+ 40,
2086
+ 41,
2087
+ 42,
2088
+ 43,
2089
+ 44,
2090
+ 45,
2091
+ 46,
2092
+ 47,
2093
+ 48,
2094
+ 49,
2095
+ 50
2096
+ ],
2097
+ "modality_keys": [
2098
+ "psm1_pose",
2099
+ "psm2_pose"
2100
+ ],
2101
+ "sin_cos_embedding_keys": null,
2102
+ "mean_std_embedding_keys": null,
2103
+ "min_max_embedding_keys": null,
2104
+ "pass_through_keys": null,
2105
+ "action_configs": [
2106
+ {
2107
+ "rep": "REL_XYZ_ROT6D",
2108
+ "type": "EEF",
2109
+ "format": "XYZ_ROT6D",
2110
+ "state_key": "psm1_pose",
2111
+ "normalization_type": "percentile",
2112
+ "input_rotation_format": "quat",
2113
+ "input_quat_order": "xyzw",
2114
+ "reference_rotation_format": "quat",
2115
+ "reference_quat_order": "xyzw",
2116
+ "translation_scaling_key": null,
2117
+ "rotation_scaling_key": null,
2118
+ "hold_through_clutch": false
2119
+ },
2120
+ {
2121
+ "rep": "REL_XYZ_ROT6D",
2122
+ "type": "EEF",
2123
+ "format": "XYZ_ROT6D",
2124
+ "state_key": "psm2_pose",
2125
+ "normalization_type": "percentile",
2126
+ "input_rotation_format": "quat",
2127
+ "input_quat_order": "xyzw",
2128
+ "reference_rotation_format": "quat",
2129
+ "reference_quat_order": "xyzw",
2130
+ "translation_scaling_key": null,
2131
+ "rotation_scaling_key": null,
2132
+ "hold_through_clutch": false
2133
+ }
2134
+ ]
2135
+ },
2136
+ "language": {
2137
+ "delta_indices": [
2138
+ 0
2139
+ ],
2140
+ "modality_keys": [
2141
+ "annotation.instruction"
2142
+ ],
2143
+ "sin_cos_embedding_keys": null,
2144
+ "mean_std_embedding_keys": null,
2145
+ "min_max_embedding_keys": null,
2146
+ "pass_through_keys": null,
2147
+ "action_configs": null
2148
+ }
2149
+ },
2150
+ "ustc_torin_tuodao": {
2151
+ "video": {
2152
+ "delta_indices": [
2153
+ 0
2154
+ ],
2155
+ "modality_keys": [
2156
+ "endoscope_left"
2157
+ ],
2158
+ "sin_cos_embedding_keys": null,
2159
+ "mean_std_embedding_keys": null,
2160
+ "min_max_embedding_keys": null,
2161
+ "pass_through_keys": null,
2162
+ "action_configs": null
2163
+ },
2164
+ "state": {
2165
+ "delta_indices": [
2166
+ 0
2167
+ ],
2168
+ "modality_keys": [
2169
+ "left_joints",
2170
+ "right_joints",
2171
+ "left_pose",
2172
+ "right_pose"
2173
+ ],
2174
+ "sin_cos_embedding_keys": null,
2175
+ "mean_std_embedding_keys": [
2176
+ "left_joints",
2177
+ "right_joints"
2178
+ ],
2179
+ "min_max_embedding_keys": null,
2180
+ "pass_through_keys": [
2181
+ "left_pose",
2182
+ "right_pose"
2183
+ ],
2184
+ "action_configs": null
2185
+ },
2186
+ "action": {
2187
+ "delta_indices": [
2188
+ 0,
2189
+ 1,
2190
+ 2,
2191
+ 3,
2192
+ 4,
2193
+ 5,
2194
+ 6,
2195
+ 7,
2196
+ 8,
2197
+ 9,
2198
+ 10,
2199
+ 11,
2200
+ 12,
2201
+ 13,
2202
+ 14,
2203
+ 15,
2204
+ 16,
2205
+ 17,
2206
+ 18,
2207
+ 19,
2208
+ 20,
2209
+ 21,
2210
+ 22,
2211
+ 23,
2212
+ 24,
2213
+ 25,
2214
+ 26,
2215
+ 27,
2216
+ 28,
2217
+ 29,
2218
+ 30,
2219
+ 31,
2220
+ 32,
2221
+ 33,
2222
+ 34,
2223
+ 35,
2224
+ 36,
2225
+ 37,
2226
+ 38,
2227
+ 39,
2228
+ 40,
2229
+ 41,
2230
+ 42,
2231
+ 43,
2232
+ 44,
2233
+ 45,
2234
+ 46,
2235
+ 47,
2236
+ 48,
2237
+ 49
2238
+ ],
2239
+ "modality_keys": [
2240
+ "left_pose",
2241
+ "left_gripper",
2242
+ "right_pose",
2243
+ "right_gripper"
2244
+ ],
2245
+ "sin_cos_embedding_keys": null,
2246
+ "mean_std_embedding_keys": null,
2247
+ "min_max_embedding_keys": null,
2248
+ "pass_through_keys": null,
2249
+ "action_configs": [
2250
+ {
2251
+ "rep": "REL_XYZ_ROT6D",
2252
+ "type": "EEF",
2253
+ "format": "XYZ_ROT6D",
2254
+ "state_key": "left_pose",
2255
+ "normalization_type": "percentile",
2256
+ "input_rotation_format": "quat",
2257
+ "input_quat_order": "xyzw",
2258
+ "reference_rotation_format": "quat",
2259
+ "reference_quat_order": "xyzw",
2260
+ "translation_scaling_key": null,
2261
+ "rotation_scaling_key": null,
2262
+ "hold_through_clutch": false
2263
+ },
2264
+ {
2265
+ "rep": "ABSOLUTE",
2266
+ "type": "NON_EEF",
2267
+ "format": "DEFAULT",
2268
+ "state_key": null,
2269
+ "normalization_type": "percentile",
2270
+ "input_rotation_format": "quat",
2271
+ "input_quat_order": "xyzw",
2272
+ "reference_rotation_format": "rot6d",
2273
+ "reference_quat_order": "xyzw",
2274
+ "translation_scaling_key": null,
2275
+ "rotation_scaling_key": null,
2276
+ "hold_through_clutch": false
2277
+ },
2278
+ {
2279
+ "rep": "REL_XYZ_ROT6D",
2280
+ "type": "EEF",
2281
+ "format": "XYZ_ROT6D",
2282
+ "state_key": "right_pose",
2283
+ "normalization_type": "percentile",
2284
+ "input_rotation_format": "quat",
2285
+ "input_quat_order": "xyzw",
2286
+ "reference_rotation_format": "quat",
2287
+ "reference_quat_order": "xyzw",
2288
+ "translation_scaling_key": null,
2289
+ "rotation_scaling_key": null,
2290
+ "hold_through_clutch": false
2291
+ },
2292
+ {
2293
+ "rep": "ABSOLUTE",
2294
+ "type": "NON_EEF",
2295
+ "format": "DEFAULT",
2296
+ "state_key": null,
2297
+ "normalization_type": "percentile",
2298
+ "input_rotation_format": "quat",
2299
+ "input_quat_order": "xyzw",
2300
+ "reference_rotation_format": "rot6d",
2301
+ "reference_quat_order": "xyzw",
2302
+ "translation_scaling_key": null,
2303
+ "rotation_scaling_key": null,
2304
+ "hold_through_clutch": false
2305
+ }
2306
+ ]
2307
+ },
2308
+ "language": {
2309
+ "delta_indices": [
2310
+ 0
2311
+ ],
2312
+ "modality_keys": [
2313
+ "annotation.instruction"
2314
+ ],
2315
+ "sin_cos_embedding_keys": null,
2316
+ "mean_std_embedding_keys": null,
2317
+ "min_max_embedding_keys": null,
2318
+ "pass_through_keys": null,
2319
+ "action_configs": null
2320
+ }
2321
+ },
2322
+ "hamlyn_dvrk_30hz": {
2323
+ "video": {
2324
+ "delta_indices": [
2325
+ 0
2326
+ ],
2327
+ "modality_keys": [
2328
+ "endoscope",
2329
+ "wrist_left",
2330
+ "wrist_right"
2331
+ ],
2332
+ "sin_cos_embedding_keys": null,
2333
+ "mean_std_embedding_keys": null,
2334
+ "min_max_embedding_keys": null,
2335
+ "pass_through_keys": null,
2336
+ "action_configs": null
2337
+ },
2338
+ "state": {
2339
+ "delta_indices": [
2340
+ 0
2341
+ ],
2342
+ "modality_keys": [
2343
+ "left_arm_pose",
2344
+ "left_arm_gripper",
2345
+ "right_arm_pose",
2346
+ "right_arm_gripper"
2347
+ ],
2348
+ "sin_cos_embedding_keys": null,
2349
+ "mean_std_embedding_keys": [
2350
+ "left_arm_pose",
2351
+ "left_arm_gripper",
2352
+ "right_arm_pose",
2353
+ "right_arm_gripper"
2354
+ ],
2355
+ "min_max_embedding_keys": null,
2356
+ "pass_through_keys": null,
2357
+ "action_configs": null
2358
+ },
2359
+ "action": {
2360
+ "delta_indices": [
2361
+ 0,
2362
+ 1,
2363
+ 2,
2364
+ 3,
2365
+ 4,
2366
+ 5,
2367
+ 6,
2368
+ 7,
2369
+ 8,
2370
+ 9,
2371
+ 10,
2372
+ 11,
2373
+ 12,
2374
+ 13,
2375
+ 14,
2376
+ 15,
2377
+ 16,
2378
+ 17,
2379
+ 18,
2380
+ 19,
2381
+ 20,
2382
+ 21,
2383
+ 22,
2384
+ 23,
2385
+ 24,
2386
+ 25,
2387
+ 26,
2388
+ 27,
2389
+ 28,
2390
+ 29,
2391
+ 30,
2392
+ 31,
2393
+ 32,
2394
+ 33,
2395
+ 34,
2396
+ 35,
2397
+ 36,
2398
+ 37,
2399
+ 38,
2400
+ 39,
2401
+ 40,
2402
+ 41,
2403
+ 42,
2404
+ 43,
2405
+ 44,
2406
+ 45,
2407
+ 46,
2408
+ 47,
2409
+ 48,
2410
+ 49
2411
+ ],
2412
+ "modality_keys": [
2413
+ "left_arm_pose",
2414
+ "left_arm_gripper",
2415
+ "right_arm_pose",
2416
+ "right_arm_gripper"
2417
+ ],
2418
+ "sin_cos_embedding_keys": null,
2419
+ "mean_std_embedding_keys": null,
2420
+ "min_max_embedding_keys": null,
2421
+ "pass_through_keys": null,
2422
+ "action_configs": [
2423
+ {
2424
+ "rep": "REL_XYZ_ROT6D",
2425
+ "type": "EEF",
2426
+ "format": "XYZ_ROT6D",
2427
+ "state_key": "left_arm_pose",
2428
+ "normalization_type": "percentile",
2429
+ "input_rotation_format": "quat",
2430
+ "input_quat_order": "wxyz",
2431
+ "reference_rotation_format": "quat",
2432
+ "reference_quat_order": "wxyz",
2433
+ "translation_scaling_key": null,
2434
+ "rotation_scaling_key": null,
2435
+ "hold_through_clutch": false
2436
+ },
2437
+ {
2438
+ "rep": "ABSOLUTE",
2439
+ "type": "NON_EEF",
2440
+ "format": "DEFAULT",
2441
+ "state_key": null,
2442
+ "normalization_type": "percentile",
2443
+ "input_rotation_format": "quat",
2444
+ "input_quat_order": "xyzw",
2445
+ "reference_rotation_format": "rot6d",
2446
+ "reference_quat_order": "xyzw",
2447
+ "translation_scaling_key": null,
2448
+ "rotation_scaling_key": null,
2449
+ "hold_through_clutch": false
2450
+ },
2451
+ {
2452
+ "rep": "REL_XYZ_ROT6D",
2453
+ "type": "EEF",
2454
+ "format": "XYZ_ROT6D",
2455
+ "state_key": "right_arm_pose",
2456
+ "normalization_type": "percentile",
2457
+ "input_rotation_format": "quat",
2458
+ "input_quat_order": "wxyz",
2459
+ "reference_rotation_format": "quat",
2460
+ "reference_quat_order": "wxyz",
2461
+ "translation_scaling_key": null,
2462
+ "rotation_scaling_key": null,
2463
+ "hold_through_clutch": false
2464
+ },
2465
+ {
2466
+ "rep": "ABSOLUTE",
2467
+ "type": "NON_EEF",
2468
+ "format": "DEFAULT",
2469
+ "state_key": null,
2470
+ "normalization_type": "percentile",
2471
+ "input_rotation_format": "quat",
2472
+ "input_quat_order": "xyzw",
2473
+ "reference_rotation_format": "rot6d",
2474
+ "reference_quat_order": "xyzw",
2475
+ "translation_scaling_key": null,
2476
+ "rotation_scaling_key": null,
2477
+ "hold_through_clutch": false
2478
+ }
2479
+ ]
2480
+ },
2481
+ "language": {
2482
+ "delta_indices": [
2483
+ 0
2484
+ ],
2485
+ "modality_keys": [
2486
+ "task"
2487
+ ],
2488
+ "sin_cos_embedding_keys": null,
2489
+ "mean_std_embedding_keys": null,
2490
+ "min_max_embedding_keys": null,
2491
+ "pass_through_keys": null,
2492
+ "action_configs": null
2493
+ }
2494
+ },
2495
+ "ucb_dvrk": {
2496
+ "video": {
2497
+ "delta_indices": [
2498
+ 0
2499
+ ],
2500
+ "modality_keys": [
2501
+ "camera_left"
2502
+ ],
2503
+ "sin_cos_embedding_keys": null,
2504
+ "mean_std_embedding_keys": null,
2505
+ "min_max_embedding_keys": null,
2506
+ "pass_through_keys": null,
2507
+ "action_configs": null
2508
+ },
2509
+ "state": {
2510
+ "delta_indices": [
2511
+ 0
2512
+ ],
2513
+ "modality_keys": [
2514
+ "psm1_joints",
2515
+ "psm1_gripper",
2516
+ "psm2_joints",
2517
+ "psm2_gripper",
2518
+ "psm1_pose",
2519
+ "psm2_pose"
2520
+ ],
2521
+ "sin_cos_embedding_keys": null,
2522
+ "mean_std_embedding_keys": [
2523
+ "psm1_joints",
2524
+ "psm1_gripper",
2525
+ "psm2_joints",
2526
+ "psm2_gripper"
2527
+ ],
2528
+ "min_max_embedding_keys": null,
2529
+ "pass_through_keys": [
2530
+ "psm1_pose",
2531
+ "psm2_pose"
2532
+ ],
2533
+ "action_configs": null
2534
+ },
2535
+ "action": {
2536
+ "delta_indices": [
2537
+ 0,
2538
+ 1,
2539
+ 2,
2540
+ 3,
2541
+ 4,
2542
+ 5,
2543
+ 6,
2544
+ 7,
2545
+ 8,
2546
+ 9,
2547
+ 10,
2548
+ 11,
2549
+ 12,
2550
+ 13,
2551
+ 14,
2552
+ 15,
2553
+ 16,
2554
+ 17,
2555
+ 18,
2556
+ 19,
2557
+ 20,
2558
+ 21,
2559
+ 22,
2560
+ 23,
2561
+ 24,
2562
+ 25,
2563
+ 26,
2564
+ 27,
2565
+ 28,
2566
+ 29,
2567
+ 30,
2568
+ 31,
2569
+ 32,
2570
+ 33,
2571
+ 34,
2572
+ 35,
2573
+ 36,
2574
+ 37,
2575
+ 38,
2576
+ 39,
2577
+ 40,
2578
+ 41,
2579
+ 42,
2580
+ 43,
2581
+ 44,
2582
+ 45,
2583
+ 46,
2584
+ 47,
2585
+ 48,
2586
+ 49
2587
+ ],
2588
+ "modality_keys": [
2589
+ "psm1_pose",
2590
+ "psm1_gripper",
2591
+ "psm2_pose",
2592
+ "psm2_gripper"
2593
+ ],
2594
+ "sin_cos_embedding_keys": null,
2595
+ "mean_std_embedding_keys": null,
2596
+ "min_max_embedding_keys": null,
2597
+ "pass_through_keys": null,
2598
+ "action_configs": [
2599
+ {
2600
+ "rep": "REL_XYZ_ROT6D",
2601
+ "type": "EEF",
2602
+ "format": "XYZ_ROT6D",
2603
+ "state_key": "psm1_pose",
2604
+ "normalization_type": "percentile",
2605
+ "input_rotation_format": "quat",
2606
+ "input_quat_order": "xyzw",
2607
+ "reference_rotation_format": "quat",
2608
+ "reference_quat_order": "xyzw",
2609
+ "translation_scaling_key": null,
2610
+ "rotation_scaling_key": null,
2611
+ "hold_through_clutch": false
2612
+ },
2613
+ {
2614
+ "rep": "ABSOLUTE",
2615
+ "type": "NON_EEF",
2616
+ "format": "DEFAULT",
2617
+ "state_key": null,
2618
+ "normalization_type": "percentile",
2619
+ "input_rotation_format": "quat",
2620
+ "input_quat_order": "xyzw",
2621
+ "reference_rotation_format": "rot6d",
2622
+ "reference_quat_order": "xyzw",
2623
+ "translation_scaling_key": null,
2624
+ "rotation_scaling_key": null,
2625
+ "hold_through_clutch": false
2626
+ },
2627
+ {
2628
+ "rep": "REL_XYZ_ROT6D",
2629
+ "type": "EEF",
2630
+ "format": "XYZ_ROT6D",
2631
+ "state_key": "psm2_pose",
2632
+ "normalization_type": "percentile",
2633
+ "input_rotation_format": "quat",
2634
+ "input_quat_order": "xyzw",
2635
+ "reference_rotation_format": "quat",
2636
+ "reference_quat_order": "xyzw",
2637
+ "translation_scaling_key": null,
2638
+ "rotation_scaling_key": null,
2639
+ "hold_through_clutch": false
2640
+ },
2641
+ {
2642
+ "rep": "ABSOLUTE",
2643
+ "type": "NON_EEF",
2644
+ "format": "DEFAULT",
2645
+ "state_key": null,
2646
+ "normalization_type": "percentile",
2647
+ "input_rotation_format": "quat",
2648
+ "input_quat_order": "xyzw",
2649
+ "reference_rotation_format": "rot6d",
2650
+ "reference_quat_order": "xyzw",
2651
+ "translation_scaling_key": null,
2652
+ "rotation_scaling_key": null,
2653
+ "hold_through_clutch": false
2654
+ }
2655
+ ]
2656
+ },
2657
+ "language": {
2658
+ "delta_indices": [
2659
+ 0
2660
+ ],
2661
+ "modality_keys": [
2662
+ "task"
2663
+ ],
2664
+ "sin_cos_embedding_keys": null,
2665
+ "mean_std_embedding_keys": null,
2666
+ "min_max_embedding_keys": null,
2667
+ "pass_through_keys": null,
2668
+ "action_configs": null
2669
+ }
2670
+ },
2671
+ "jhu_imerse_star_il": {
2672
+ "video": {
2673
+ "delta_indices": [
2674
+ 0
2675
+ ],
2676
+ "modality_keys": [
2677
+ "endoscope_left",
2678
+ "wrist_left"
2679
+ ],
2680
+ "sin_cos_embedding_keys": null,
2681
+ "mean_std_embedding_keys": null,
2682
+ "min_max_embedding_keys": null,
2683
+ "pass_through_keys": null,
2684
+ "action_configs": null
2685
+ },
2686
+ "state": {
2687
+ "delta_indices": [
2688
+ 0
2689
+ ],
2690
+ "modality_keys": [
2691
+ "kuka_joint_pos",
2692
+ "endo360_joint_pos",
2693
+ "kuka_pose"
2694
+ ],
2695
+ "sin_cos_embedding_keys": null,
2696
+ "mean_std_embedding_keys": [
2697
+ "kuka_joint_pos",
2698
+ "endo360_joint_pos"
2699
+ ],
2700
+ "min_max_embedding_keys": null,
2701
+ "pass_through_keys": [
2702
+ "kuka_pose"
2703
+ ],
2704
+ "action_configs": null
2705
+ },
2706
+ "action": {
2707
+ "delta_indices": [
2708
+ 1,
2709
+ 2,
2710
+ 3,
2711
+ 4,
2712
+ 5,
2713
+ 6,
2714
+ 7,
2715
+ 8,
2716
+ 9,
2717
+ 10,
2718
+ 11,
2719
+ 12,
2720
+ 13,
2721
+ 14,
2722
+ 15,
2723
+ 16,
2724
+ 17,
2725
+ 18,
2726
+ 19,
2727
+ 20,
2728
+ 21,
2729
+ 22,
2730
+ 23,
2731
+ 24,
2732
+ 25,
2733
+ 26,
2734
+ 27,
2735
+ 28,
2736
+ 29,
2737
+ 30,
2738
+ 31,
2739
+ 32,
2740
+ 33,
2741
+ 34,
2742
+ 35,
2743
+ 36,
2744
+ 37,
2745
+ 38,
2746
+ 39,
2747
+ 40,
2748
+ 41,
2749
+ 42,
2750
+ 43,
2751
+ 44,
2752
+ 45,
2753
+ 46,
2754
+ 47,
2755
+ 48,
2756
+ 49,
2757
+ 50
2758
+ ],
2759
+ "modality_keys": [
2760
+ "kuka_pose"
2761
+ ],
2762
+ "sin_cos_embedding_keys": null,
2763
+ "mean_std_embedding_keys": null,
2764
+ "min_max_embedding_keys": null,
2765
+ "pass_through_keys": null,
2766
+ "action_configs": [
2767
+ {
2768
+ "rep": "REL_XYZ_ROT6D",
2769
+ "type": "EEF",
2770
+ "format": "XYZ_ROT6D",
2771
+ "state_key": "kuka_pose",
2772
+ "normalization_type": "percentile",
2773
+ "input_rotation_format": "quat",
2774
+ "input_quat_order": "xyzw",
2775
+ "reference_rotation_format": "quat",
2776
+ "reference_quat_order": "xyzw",
2777
+ "translation_scaling_key": null,
2778
+ "rotation_scaling_key": null,
2779
+ "hold_through_clutch": false
2780
+ }
2781
+ ]
2782
+ },
2783
+ "language": {
2784
+ "delta_indices": [
2785
+ 0
2786
+ ],
2787
+ "modality_keys": [
2788
+ "annotation.human.task_description"
2789
+ ],
2790
+ "sin_cos_embedding_keys": null,
2791
+ "mean_std_embedding_keys": null,
2792
+ "min_max_embedding_keys": null,
2793
+ "pass_through_keys": null,
2794
+ "action_configs": null
2795
+ }
2796
+ }
2797
+ },
2798
+ "image_crop_size": [
2799
+ 224,
2800
+ 392
2801
+ ],
2802
+ "image_target_size": [
2803
+ 236,
2804
+ 414
2805
+ ],
2806
+ "use_albumentations": false,
2807
+ "random_rotation_angle": 5,
2808
+ "color_jitter_params": {
2809
+ "brightness": 0.12,
2810
+ "contrast": 0.15,
2811
+ "saturation": 0.15,
2812
+ "hue": 0.02
2813
+ },
2814
+ "shortest_image_edge": null,
2815
+ "crop_fraction": null,
2816
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
2817
+ "model_type": "eagle",
2818
+ "formalize_language": true,
2819
+ "max_state_dim": 128,
2820
+ "max_action_dim": 128,
2821
+ "max_action_horizon": 50,
2822
+ "use_percentiles": false,
2823
+ "clip_outliers": true,
2824
+ "apply_sincos_state_encoding": true,
2825
+ "use_relative_action": true
2826
+ }
2827
+ }
statistics.json ADDED
The diff for this file is too large to render. See raw diff