Upload folder using huggingface_hub

Browse files

Files changed (13) hide show

checkpoint-10000/config.json +65 -0
checkpoint-10000/experiment_cfg/data_config.py +1981 -0
checkpoint-10000/experiment_cfg/metadata.json +871 -0
checkpoint-10000/experiment_cfg/train_config.yaml +38 -0
checkpoint-10000/model-00001-of-00002.safetensors +3 -0
checkpoint-10000/model-00002-of-00002.safetensors +3 -0
checkpoint-10000/model.safetensors.index.json +0 -0
checkpoint-10000/optimizer.pt +3 -0
checkpoint-10000/rng_state_0.pth +3 -0
checkpoint-10000/rng_state_1.pth +3 -0
checkpoint-10000/scheduler.pt +3 -0
checkpoint-10000/trainer_state.json +0 -0
runs/Mar24_21-00-46_worker-node1000/events.out.tfevents.1774353663.worker-node1000.426899.0 +2 -2

checkpoint-10000/config.json ADDED Viewed

	@@ -0,0 +1,65 @@

+{
+  "action_dim": 66,
+  "action_head_cfg": {
+    "action_dim": 66,
+    "action_horizon": 40,
+    "add_pos_embed": true,
+    "backbone_embedding_dim": 2048,
+    "diffusion_model_cfg": {
+      "attention_head_dim": 48,
+      "cross_attention_dim": 2048,
+      "dropout": 0.2,
+      "final_dropout": true,
+      "interleave_self_attention": true,
+      "norm_type": "ada_norm",
+      "num_attention_heads": 32,
+      "num_layers": 16,
+      "output_dim": 1024,
+      "positional_embeddings": null
+    },
+    "hidden_size": 1024,
+    "input_embedding_dim": 1536,
+    "max_action_dim": 32,
+    "max_state_dim": 66,
+    "model_dtype": "float32",
+    "noise_beta_alpha": 1.5,
+    "noise_beta_beta": 1.0,
+    "noise_s": 0.999,
+    "num_inference_timesteps": 4,
+    "num_target_vision_tokens": 32,
+    "num_timestep_buckets": 1000,
+    "training_rtc_max_overlap": -1,
+    "tune_diffusion_model": true,
+    "tune_projector": true,
+    "use_vlln": true,
+    "vl_self_attention_cfg": {
+      "attention_head_dim": 64,
+      "dropout": 0.2,
+      "final_dropout": true,
+      "num_attention_heads": 32,
+      "num_layers": 4,
+      "positional_embeddings": null
+    }
+  },
+  "action_horizon": 40,
+  "architectures": [
+    "GR00T_N1_5"
+  ],
+  "attn_implementation": null,
+  "backbone_cfg": {
+    "eagle_path": "NVEagle/eagle_er-qwen3_1_7B-Siglip2_400M_stage1_5_128gpu_er_v7_1mlp_nops",
+    "load_bf16": false,
+    "project_to_dim": null,
+    "reproject_vision": false,
+    "select_layer": 12,
+    "tune_llm": false,
+    "tune_visual": true,
+    "use_flash_attention": true
+  },
+  "compute_dtype": "bfloat16",
+  "hidden_size": 2048,
+  "model_dtype": "float32",
+  "model_type": "gr00t_n1_5",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3"
+}

checkpoint-10000/experiment_cfg/data_config.py ADDED Viewed

	@@ -0,0 +1,1981 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Optional
+import torch
+from gr00t.data.dataset import ModalityConfig
+from gr00t.data.transform.base import ComposedModalityTransform, ModalityTransform
+from gr00t.data.transform.concat import ConcatTransform
+from gr00t.data.transform.state_action import (
+    StateActionSinCosTransform,
+    StateActionToTensor,
+    StateActionTransform,
+)
+from gr00t.data.transform.video import (
+    VideoColorJitter,
+    VideoCrop,
+    VideoPerspective,
+    VideoResize,
+    VideoToNumpy,
+    VideoToTensor,
+)
+from gr00t.model.transforms import GR00TTransform
+class BaseDataConfig(ABC):
+    vlash_offset: int = 0
+    training_rtc_max_overlap: int = 0
+    def modality_config(self) -> dict[str, ModalityConfig]:
+        video_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.video_keys,
+        )
+        state_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.state_keys,
+        )
+        action_modality = ModalityConfig(
+            delta_indices=self.action_indices,
+            modality_keys=self.action_keys,
+        )
+        language_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.language_keys,
+        )
+        return {
+            "video": video_modality,
+            "state": state_modality,
+            "action": action_modality,
+            "language": language_modality,
+        }
+    @abstractmethod
+    def transform(self) -> ModalityTransform:
+        pass
+#####################################################################################
+# helper functions
+#####################################################################################
+def import_external_data_config(data_config_str: str) -> Optional[BaseDataConfig]:
+    """
+    Import and instantiate an external data configuration class.
+    Format: "module_path:ClassName" (e.g., "my_configs:RobotConfig")
+    Supports nested modules like "package.submodule:ClassName"
+    """
+    if ":" not in data_config_str:
+        return None
+    import importlib
+    import os
+    import sys
+    from pathlib import Path
+    # Add current working directory to Python path
+    current_dir = str(Path(os.getcwd()).absolute())
+    if current_dir not in sys.path:
+        sys.path.insert(0, current_dir)
+    try:
+        module_path, class_name = data_config_str.split(":", 1)
+        if not module_path or not class_name:
+            raise ValueError(f"Invalid format: '{data_config_str}'. Use 'module:ClassName'")
+        print(f"Loading external config: {module_path}.{class_name}")
+        module = importlib.import_module(module_path)
+        if not hasattr(module, class_name):
+            available = [
+                n
+                for n in dir(module)
+                if not n.startswith("_") and isinstance(getattr(module, n), type)
+            ]
+            raise AttributeError(
+                f"Class '{class_name}' not found in '{module_path}'. Available: {available}"
+            )
+        # assert if the class has 'transform' and 'modality_config' methods
+        if not hasattr(getattr(module, class_name), "transform"):
+            raise AttributeError(f"Class '{class_name}' does not have a 'transform' method")
+        if not hasattr(getattr(module, class_name), "modality_config"):
+            raise AttributeError(f"Class '{class_name}' does not have a 'modality_config' method")
+        return getattr(module, class_name)()
+    except (ModuleNotFoundError, AttributeError, ValueError) as e:
+        print(f"Config loading failed: {e}")
+        print("Example: my_configs:MyConfig, package.submodule:ClassName")
+        raise
+def load_data_config(data_config_str: str) -> BaseDataConfig:
+    """
+    Get a data config class from a string.
+    >>> load_data_config("so100")
+    >>> get_data_config("dir.subdir.my_configs:RobotConfig")
+    """
+    if data_config_str in DATA_CONFIG_MAP:
+        return DATA_CONFIG_MAP[data_config_str]
+    data_config_cls = import_external_data_config(data_config_str)
+    if data_config_cls is not None:
+        return data_config_cls
+    # Yellow warning color
+    yellow = "\033[93m"
+    reset = "\033[0m"
+    raise ValueError(
+        f"{yellow}Invalid data_config '{data_config_str}'. "
+        f"Available options: {list(DATA_CONFIG_MAP.keys())}, "
+        f"or use 'module:ClassName' for external configs{reset}"
+    )
+###########################################################################################
+class FourierGr1ArmsOnlyDataConfig(BaseDataConfig):
+    video_keys = ["video.ego_view"]
+    state_keys = [
+        "state.left_arm",
+        "state.right_arm",
+        "state.left_hand",
+        "state.right_hand",
+    ]
+    action_keys = [
+        "action.left_arm",
+        "action.right_arm",
+        "action.left_hand",
+        "action.right_hand",
+    ]
+    language_keys = ["annotation.human.action.task_description"]
+    observation_indices = [0]
+    action_indices = list(range(16))
+    def transform(self) -> ModalityTransform:
+        transforms = [
+            # video transforms
+            VideoToTensor(apply_to=self.video_keys),
+            VideoCrop(apply_to=self.video_keys, scale=0.95),
+            VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"),
+            VideoColorJitter(
+                apply_to=self.video_keys,
+                brightness=0.3,
+                contrast=0.4,
+                saturation=0.5,
+                hue=0.08,
+            ),
+            VideoToNumpy(apply_to=self.video_keys),
+            # state transforms
+            StateActionToTensor(apply_to=self.state_keys),
+            StateActionSinCosTransform(apply_to=self.state_keys),
+            # action transforms
+            StateActionToTensor(apply_to=self.action_keys),
+            StateActionTransform(
+                apply_to=self.action_keys,
+                normalization_modes={key: "min_max" for key in self.action_keys},
+            ),
+            # concat transforms
+            ConcatTransform(
+                video_concat_order=self.video_keys,
+                state_concat_order=self.state_keys,
+                action_concat_order=self.action_keys,
+            ),
+            # model-specific transform
+            GR00TTransform(
+                state_horizon=len(self.observation_indices),
+                action_horizon=len(self.action_indices),
+                max_state_dim=64,
+                max_action_dim=32,
+            ),
+        ]
+        return ComposedModalityTransform(transforms=transforms)
+###########################################################################################
+class So100DataConfig(BaseDataConfig):
+    video_keys = ["video.webcam"]
+    state_keys = ["state.single_arm", "state.gripper"]
+    action_keys = ["action.single_arm", "action.gripper"]
+    language_keys = ["annotation.human.task_description"]
+    observation_indices = [0]
+    action_indices = list(range(16))
+    def transform(self) -> ModalityTransform:
+        transforms = [
+            # video transforms
+            VideoToTensor(apply_to=self.video_keys),
+            VideoCrop(apply_to=self.video_keys, scale=0.95),
+            VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"),
+            VideoColorJitter(
+                apply_to=self.video_keys,
+                brightness=0.3,
+                contrast=0.4,
+                saturation=0.5,
+                hue=0.08,
+            ),
+            VideoToNumpy(apply_to=self.video_keys),
+            # state transforms
+            StateActionToTensor(apply_to=self.state_keys),
+            StateActionTransform(
+                apply_to=self.state_keys,
+                normalization_modes={key: "min_max" for key in self.state_keys},
+            ),
+            # action transforms
+            StateActionToTensor(apply_to=self.action_keys),
+            StateActionTransform(
+                apply_to=self.action_keys,
+                normalization_modes={key: "min_max" for key in self.action_keys},
+            ),
+            # concat transforms
+            ConcatTransform(
+                video_concat_order=self.video_keys,
+                state_concat_order=self.state_keys,
+                action_concat_order=self.action_keys,
+            ),
+            # model-specific transform
+            GR00TTransform(
+                state_horizon=len(self.observation_indices),
+                action_horizon=len(self.action_indices),
+                max_state_dim=64,
+                max_action_dim=32,
+            ),
+        ]
+        return ComposedModalityTransform(transforms=transforms)
+###########################################################################################
+class So100DualCamDataConfig(So100DataConfig):
+    video_keys = ["video.front", "video.wrist"]
+    state_keys = ["state.single_arm", "state.gripper"]
+    action_keys = ["action.single_arm", "action.gripper"]
+    language_keys = ["annotation.human.task_description"]
+    observation_indices = [0]
+    action_indices = list(range(16))
+###########################################################################################
+class UnitreeG1DataConfig(BaseDataConfig):
+    video_keys = ["video.rs_view"]
+    state_keys = ["state.left_arm", "state.right_arm", "state.left_hand", "state.right_hand"]
+    action_keys = ["action.left_arm", "action.right_arm", "action.left_hand", "action.right_hand"]
+    language_keys = ["annotation.human.task_description"]
+    observation_indices = [0]
+    action_indices = list(range(16))
+    def transform(self) -> ModalityTransform:
+        transforms = [
+            # video transforms
+            VideoToTensor(apply_to=self.video_keys),
+            VideoCrop(apply_to=self.video_keys, scale=0.95),
+            VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"),
+            VideoColorJitter(
+                apply_to=self.video_keys,
+                brightness=0.3,
+                contrast=0.4,
+                saturation=0.5,
+                hue=0.08,
+            ),
+            VideoToNumpy(apply_to=self.video_keys),
+            # state transforms
+            StateActionToTensor(apply_to=self.state_keys),
+            StateActionTransform(
+                apply_to=self.state_keys,
+                normalization_modes={key: "min_max" for key in self.state_keys},
+            ),
+            # action transforms
+            StateActionToTensor(apply_to=self.action_keys),
+            StateActionTransform(
+                apply_to=self.action_keys,
+                normalization_modes={key: "min_max" for key in self.action_keys},
+            ),
+            # concat transforms
+            ConcatTransform(
+                video_concat_order=self.video_keys,
+                state_concat_order=self.state_keys,
+                action_concat_order=self.action_keys,
+            ),
+            # model-specific transform
+            GR00TTransform(
+                state_horizon=len(self.observation_indices),
+                action_horizon=len(self.action_indices),
+                max_state_dim=64,
+                max_action_dim=32,
+            ),
+        ]
+        return ComposedModalityTransform(transforms=transforms)
+class UnitreeG1FullBodyDataConfig(UnitreeG1DataConfig):
+    video_keys = ["video.rs_view"]
+    state_keys = [
+        "state.left_leg",
+        "state.right_leg",
+        "state.waist",
+        "state.left_arm",
+        "state.right_arm",
+        "state.left_hand",
+        "state.right_hand",
+    ]
+    action_keys = ["action.left_arm", "action.right_arm", "action.left_hand", "action.right_hand"]
+    language_keys = ["annotation.human.task_description"]
+    observation_indices = [0]
+    action_indices = list(range(16))
+###########################################################################################
+class FourierGr1FullUpperBodyDataConfig(BaseDataConfig):
+    video_keys = ["video.front_view"]
+    state_keys = [
+        "state.left_arm",
+        "state.right_arm",
+        "state.left_hand",
+        "state.right_hand",
+        "state.waist",
+        "state.neck",
+    ]
+    action_keys = [
+        "action.left_arm",
+        "action.right_arm",
+        "action.left_hand",
+        "action.right_hand",
+        "action.waist",
+        "action.neck",
+    ]
+    language_keys = ["annotation.human.action.task_description"]
+    observation_indices = [0]
+    action_indices = list(range(16))
+    def transform(self):
+        transforms = [
+            # video transforms
+            VideoToTensor(apply_to=self.video_keys),
+            VideoCrop(apply_to=self.video_keys, scale=0.95),
+            VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"),
+            VideoColorJitter(
+                apply_to=self.video_keys,
+                brightness=0.3,
+                contrast=0.4,
+                saturation=0.5,
+                hue=0.08,
+            ),
+            VideoToNumpy(apply_to=self.video_keys),
+            # state transforms
+            StateActionToTensor(apply_to=self.state_keys),
+            StateActionTransform(
+                apply_to=self.state_keys,
+                normalization_modes={key: "min_max" for key in self.state_keys},
+            ),
+            # action transforms
+            StateActionToTensor(apply_to=self.action_keys),
+            StateActionTransform(
+                apply_to=self.action_keys,
+                normalization_modes={key: "min_max" for key in self.action_keys},
+            ),
+            # concat transforms
+            ConcatTransform(
+                video_concat_order=self.video_keys,
+                state_concat_order=self.state_keys,
+                action_concat_order=self.action_keys,
+            ),
+            GR00TTransform(
+                state_horizon=len(self.observation_indices),
+                action_horizon=len(self.action_indices),
+                max_state_dim=64,
+                max_action_dim=32,
+            ),
+        ]
+        return ComposedModalityTransform(transforms=transforms)
+###########################################################################################
+class BimanualPandaGripperDataConfig(BaseDataConfig):
+    video_keys = [
+        "video.right_wrist_view",
+        "video.left_wrist_view",
+        "video.front_view",
+    ]
+    state_keys = [
+        "state.right_arm_eef_pos",
+        "state.right_arm_eef_quat",
+        "state.right_gripper_qpos",
+        "state.left_arm_eef_pos",
+        "state.left_arm_eef_quat",
+        "state.left_gripper_qpos",
+    ]
+    action_keys = [
+        "action.right_arm_eef_pos",
+        "action.right_arm_eef_rot",
+        "action.right_gripper_close",
+        "action.left_arm_eef_pos",
+        "action.left_arm_eef_rot",
+        "action.left_gripper_close",
+    ]
+    language_keys = ["annotation.human.action.task_description"]
+    observation_indices = [0]
+    action_indices = list(range(16))
+    # Used in StateActionTransform for normalization and target rotations
+    state_normalization_modes = {
+        "state.right_arm_eef_pos": "min_max",
+        "state.right_gripper_qpos": "min_max",
+        "state.left_arm_eef_pos": "min_max",
+        "state.left_gripper_qpos": "min_max",
+    }
+    state_target_rotations = {
+        "state.right_arm_eef_quat": "rotation_6d",
+        "state.left_arm_eef_quat": "rotation_6d",
+    }
+    action_normalization_modes = {
+        "action.right_gripper_close": "binary",
+        "action.left_gripper_close": "binary",
+    }
+    def transform(self):
+        transforms = [
+            # video transforms
+            VideoToTensor(apply_to=self.video_keys),
+            VideoCrop(apply_to=self.video_keys, scale=0.95),
+            VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"),
+            VideoColorJitter(
+                apply_to=self.video_keys,
+                brightness=0.3,
+                contrast=0.4,
+                saturation=0.5,
+                hue=0.08,
+            ),
+            VideoToNumpy(apply_to=self.video_keys),
+            # state transforms
+            StateActionToTensor(apply_to=self.state_keys),
+            StateActionTransform(
+                apply_to=self.state_keys,
+                normalization_modes=self.state_normalization_modes,
+                target_rotations=self.state_target_rotations,
+            ),
+            # action transforms
+            StateActionToTensor(apply_to=self.action_keys),
+            StateActionTransform(
+                apply_to=self.action_keys,
+                normalization_modes=self.action_normalization_modes,
+            ),
+            # concat transforms
+            ConcatTransform(
+                video_concat_order=self.video_keys,
+                state_concat_order=self.state_keys,
+                action_concat_order=self.action_keys,
+            ),
+            GR00TTransform(
+                state_horizon=len(self.observation_indices),
+                action_horizon=len(self.action_indices),
+                max_state_dim=64,
+                max_action_dim=32,
+            ),
+        ]
+        return ComposedModalityTransform(transforms=transforms)
+###########################################################################################
+class BimanualPandaHandDataConfig(BimanualPandaGripperDataConfig):
+    video_keys = [
+        "video.right_wrist_view",
+        "video.left_wrist_view",
+        "video.ego_view",
+    ]
+    state_keys = [
+        "state.right_arm_eef_pos",
+        "state.right_arm_eef_quat",
+        "state.right_hand",
+        "state.left_arm_eef_pos",
+        "state.left_arm_eef_quat",
+        "state.left_hand",
+    ]
+    action_keys = [
+        "action.right_arm_eef_pos",
+        "action.right_arm_eef_rot",
+        "action.right_hand",
+        "action.left_arm_eef_pos",
+        "action.left_arm_eef_rot",
+        "action.left_hand",
+    ]
+    language_keys = ["annotation.human.action.task_description"]
+    observation_indices = [0]
+    action_indices = list(range(16))
+    # Used in StateActionTransform for normalization and target rotations
+    state_normalization_modes = {
+        "state.right_arm_eef_pos": "min_max",
+        "state.right_hand": "min_max",
+        "state.left_arm_eef_pos": "min_max",
+        "state.left_hand": "min_max",
+    }
+    action_normalization_modes = {
+        "action.right_hand": "min_max",
+        "action.left_hand": "min_max",
+    }
+    state_target_rotations = {
+        "state.right_arm_eef_quat": "rotation_6d",
+        "state.left_arm_eef_quat": "rotation_6d",
+    }
+###########################################################################################
+class SinglePandaGripperDataConfig(BimanualPandaGripperDataConfig):
+    video_keys = [
+        "video.left_view",
+        "video.right_view",
+        "video.wrist_view",
+    ]
+    state_keys = [
+        "state.end_effector_position_relative",
+        "state.end_effector_rotation_relative",
+        "state.gripper_qpos",
+        "state.base_position",
+        "state.base_rotation",
+    ]
+    action_keys = [
+        "action.end_effector_position",
+        "action.end_effector_rotation",
+        "action.gripper_close",
+        "action.base_motion",
+        "action.control_mode",
+    ]
+    language_keys = ["annotation.human.action.task_description"]
+    observation_indices = [0]
+    action_indices = list(range(16))
+    # Used in StateActionTransform for normalization and target rotations
+    state_normalization_modes = {
+        "state.end_effector_position_relative": "min_max",
+        "state.end_effector_rotation_relative": "min_max",
+        "state.gripper_qpos": "min_max",
+        "state.base_position": "min_max",
+        "state.base_rotation": "min_max",
+    }
+    state_target_rotations = {
+        "state.end_effector_rotation_relative": "rotation_6d",
+        "state.base_rotation": "rotation_6d",
+    }
+    action_normalization_modes = {
+        "action.end_effector_position": "min_max",
+        "action.end_effector_rotation": "min_max",
+        "action.gripper_close": "binary",
+        "action.base_motion": "min_max",
+        "action.control_mode": "binary",
+    }
+###########################################################################################
+class FourierGr1ArmsWaistDataConfig(FourierGr1ArmsOnlyDataConfig):
+    video_keys = ["video.ego_view"]
+    state_keys = [
+        "state.left_arm",
+        "state.right_arm",
+        "state.left_hand",
+        "state.right_hand",
+        "state.waist",
+    ]
+    action_keys = [
+        "action.left_arm",
+        "action.right_arm",
+        "action.left_hand",
+        "action.right_hand",
+        "action.waist",
+    ]
+    language_keys = ["annotation.human.coarse_action"]
+    observation_indices = [0]
+    action_indices = list(range(16))
+    def transform(self):
+        return super().transform()
+class FourierGr1ArmsWaistWithMANODataConfig(FourierGr1ArmsOnlyDataConfig):
+    video_keys = ["video.ego_view"]
+    state_keys = [
+        "state.left_arm",
+        "state.right_arm",
+        "state.left_hand",
+        "state.right_hand",
+        "state.waist",
+    ]
+    action_keys = [
+        "action.left_arm",
+        "action.right_arm",
+        "action.left_hand",
+        "action.right_hand",
+        "action.waist",
+    ]
+    language_keys = ["annotation.human.coarse_action"]
+    observation_indices = [0]
+    action_indices = list(range(16))
+    action_dim = 144
+    def transform(self):
+        # Get parent transform
+        parent_transform = super().transform()
+        # Extract the transforms list
+        transforms = parent_transform.transforms
+        # Find ConcatTransform index and insert tensor-to-numpy conversion after it
+        concat_idx = None
+        for i, t in enumerate(transforms):
+            if isinstance(t, ConcatTransform):
+                concat_idx = i
+                break
+        if concat_idx is not None:
+            # Create a simple transform to convert torch tensors to numpy for state/action
+            class TensorToNumpyTransform(ModalityTransform):
+                def apply(self, data: dict) -> dict:
+                    for key in ["state", "action"]:
+                        if key in data:
+                            value = data[key]
+                            # Check if it's a torch tensor by checking for torch tensor methods
+                            if (
+                                hasattr(value, "detach")
+                                and hasattr(value, "cpu")
+                                and hasattr(value, "numpy")
+                            ):
+                                data[key] = value.detach().cpu().numpy()
+                    return data
+            # Insert after ConcatTransform, before GR00TTransform
+            transforms.insert(concat_idx + 1, TensorToNumpyTransform(apply_to=[]))
+        return ComposedModalityTransform(transforms=transforms)
+###########################################################################################
+class OxeDroidDataConfig(BaseDataConfig):
+    video_keys = [
+        "video.exterior_image_1",
+        "video.exterior_image_2",
+        "video.wrist_image",
+    ]
+    state_keys = [
+        "state.eef_position",
+        "state.eef_rotation",
+        "state.gripper_position",
+    ]
+    action_keys = [
+        "action.eef_position_delta",
+        "action.eef_rotation_delta",
+        "action.gripper_position",
+    ]
+    language_keys = ["annotation.language.language_instruction"]
+    observation_indices = [0]
+    action_indices = list(range(16))
+    def transform(self):
+        transforms = [
+            # video transforms
+            VideoToTensor(apply_to=self.video_keys),
+            VideoCrop(apply_to=self.video_keys, scale=0.95),
+            VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"),
+            VideoColorJitter(
+                apply_to=self.video_keys,
+                brightness=0.3,
+                contrast=0.4,
+                saturation=0.5,
+                hue=0.08,
+            ),
+            VideoToNumpy(apply_to=self.video_keys),
+            # state transforms
+            StateActionToTensor(apply_to=self.state_keys),
+            StateActionTransform(
+                apply_to=self.state_keys,
+                normalization_modes={
+                    "state.eef_position": "min_max",
+                    "state.gripper_position": "min_max",
+                },
+                target_rotations={
+                    "state.eef_rotation": "rotation_6d",
+                },
+            ),
+            # action transforms
+            StateActionToTensor(apply_to=self.action_keys),
+            StateActionTransform(
+                apply_to=self.action_keys,
+                normalization_modes={
+                    "action.gripper_position": "binary",
+                },
+                target_rotations={"action.eef_rotation_delta": "axis_angle"},
+            ),
+            # concat transforms
+            ConcatTransform(
+                video_concat_order=self.video_keys,
+                state_concat_order=self.state_keys,
+                action_concat_order=self.action_keys,
+            ),
+            GR00TTransform(
+                state_horizon=len(self.observation_indices),
+                action_horizon=len(self.action_indices),
+                max_state_dim=64,
+                max_action_dim=32,
+            ),
+        ]
+        return ComposedModalityTransform(transforms=transforms)
+###########################################################################################
+class AgibotGenie1DataConfig(BaseDataConfig):
+    video_keys = [
+        "video.top_head",
+        "video.hand_left",
+        "video.hand_right",
+    ]
+    state_keys = [
+        "state.left_arm_joint_position",
+        "state.right_arm_joint_position",
+        "state.left_effector_position",
+        "state.right_effector_position",
+        "state.head_position",
+        "state.waist_position",
+    ]
+    action_keys = [
+        "action.left_arm_joint_position",
+        "action.right_arm_joint_position",
+        "action.left_effector_position",
+        "action.right_effector_position",
+        "action.head_position",
+        "action.waist_position",
+        "action.robot_velocity",
+    ]
+    language_keys = ["annotation.language.action_text"]
+    observation_indices = [0]
+    action_indices = list(range(16))
+    def transform(self):
+        transforms = [
+            # video transforms
+            VideoToTensor(apply_to=self.video_keys),
+            VideoCrop(apply_to=self.video_keys, scale=0.95),
+            VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"),
+            VideoColorJitter(
+                apply_to=self.video_keys,
+                brightness=0.3,
+                contrast=0.4,
+                saturation=0.5,
+                hue=0.08,
+            ),
+            VideoToNumpy(apply_to=self.video_keys),
+            # state transforms
+            StateActionToTensor(apply_to=self.state_keys),
+            StateActionTransform(
+                apply_to=self.state_keys,
+                normalization_modes={key: "min_max" for key in self.state_keys},
+            ),
+            # action transforms
+            StateActionToTensor(apply_to=self.action_keys),
+            StateActionTransform(
+                apply_to=self.action_keys,
+                normalization_modes={key: "min_max" for key in self.action_keys},
+            ),
+            # concat transforms
+            ConcatTransform(
+                video_concat_order=self.video_keys,
+                state_concat_order=self.state_keys,
+                action_concat_order=self.action_keys,
+            ),
+            GR00TTransform(
+                state_horizon=len(self.observation_indices),
+                action_horizon=len(self.action_indices),
+                max_state_dim=64,
+                max_action_dim=32,
+            ),
+        ]
+        return ComposedModalityTransform(transforms=transforms)
+###########################################################################################
+class Gr1DataConfig(BaseDataConfig):
+    video_keys = ["video.camera_ego", "video.camera_ext"]
+    state_keys = [
+        "state.torso_joints",
+        "state.head_joints",
+        "state.right_arm_joints",
+        "state.left_arm_joints",
+        "state.right_hand_joints",
+        "state.left_hand_joints",
+    ]
+    action_keys = [
+        "action.right_arm_eef_pos",
+        "action.left_arm_eef_pos",
+        "action.right_finger_joints",
+        "action.left_finger_joints",
+    ]
+    language_keys = ["annotation.human.task_description"]
+    observation_indices = [0]
+    action_indices = list(range(16))
+    def modality_config(self) -> dict[str, ModalityConfig]:
+        video_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.video_keys,
+        )
+        state_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.state_keys,
+        )
+        action_modality = ModalityConfig(
+            delta_indices=self.action_indices,
+            modality_keys=self.action_keys,
+        )
+        language_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.language_keys,
+        )
+        modality_configs = {
+            "video": video_modality,
+            "state": state_modality,
+            "action": action_modality,
+            "language": language_modality,
+        }
+        return modality_configs
+    def transform(self) -> ModalityTransform:
+        transforms = [
+            # video transforms
+            VideoToTensor(apply_to=self.video_keys),
+            VideoCrop(apply_to=self.video_keys, scale=0.95),
+            VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"),
+            VideoColorJitter(
+                apply_to=self.video_keys,
+                brightness=0.3,
+                contrast=0.4,
+                saturation=0.5,
+                hue=0.08,
+            ),
+            VideoToNumpy(apply_to=self.video_keys),
+            # state transforms
+            StateActionToTensor(apply_to=self.state_keys),
+            StateActionTransform(
+                apply_to=self.state_keys,
+                normalization_modes={key: "min_max" for key in self.state_keys},
+            ),
+            # action transforms
+            StateActionToTensor(apply_to=self.action_keys),
+            StateActionTransform(
+                apply_to=self.action_keys,
+                normalization_modes={key: "min_max" for key in self.action_keys},
+            ),
+            # concat transforms
+            ConcatTransform(
+                video_concat_order=self.video_keys,
+                state_concat_order=self.state_keys,
+                action_concat_order=self.action_keys,
+            ),
+            # model-specific transform
+            GR00TTransform(
+                state_horizon=len(self.observation_indices),
+                action_horizon=len(self.action_indices),
+                max_state_dim=64,
+                max_action_dim=32,
+            ),
+        ]
+        return ComposedModalityTransform(transforms=transforms)
+###########################################################################################
+class Gr1NoImageDataConfig(BaseDataConfig):
+    video_keys = []
+    state_keys = [
+        "state.torso_joints",
+        "state.head_joints",
+        "state.right_arm_joints",
+        "state.left_arm_joints",
+        "state.right_hand_joints",
+        "state.left_hand_joints",
+    ]
+    action_keys = [
+        "action.right_arm_eef_pos",
+        "action.left_arm_eef_pos",
+        "action.right_finger_joints",
+        "action.left_finger_joints",
+    ]
+    language_keys = ["annotation.human.task_description"]
+    observation_indices = [0]
+    action_indices = list(range(16))
+    def modality_config(self) -> dict[str, ModalityConfig]:
+        video_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.video_keys,
+        )
+        state_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.state_keys,
+        )
+        action_modality = ModalityConfig(
+            delta_indices=self.action_indices,
+            modality_keys=self.action_keys,
+        )
+        language_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.language_keys,
+        )
+        modality_configs = {
+            "video": video_modality,
+            "state": state_modality,
+            "action": action_modality,
+            "language": language_modality,
+        }
+        return modality_configs
+    def transform(self) -> ModalityTransform:
+        transforms = []
+        # video transforms - only add if video_keys is not empty
+        if self.video_keys:
+            transforms.extend(
+                [
+                    VideoToTensor(apply_to=self.video_keys),
+                    VideoCrop(apply_to=self.video_keys, scale=0.95),
+                    VideoResize(
+                        apply_to=self.video_keys, height=224, width=224, interpolation="linear"
+                    ),
+                    VideoColorJitter(
+                        apply_to=self.video_keys,
+                        brightness=0.3,
+                        contrast=0.4,
+                        saturation=0.5,
+                        hue=0.08,
+                    ),
+                    VideoToNumpy(apply_to=self.video_keys),
+                ]
+            )
+        # state transforms
+        transforms.extend(
+            [
+                StateActionToTensor(apply_to=self.state_keys),
+                StateActionTransform(
+                    apply_to=self.state_keys,
+                    normalization_modes={key: "min_max" for key in self.state_keys},
+                ),
+            ]
+        )
+        # action transforms
+        transforms.extend(
+            [
+                StateActionToTensor(apply_to=self.action_keys),
+                StateActionTransform(
+                    apply_to=self.action_keys,
+                    normalization_modes={key: "min_max" for key in self.action_keys},
+                ),
+            ]
+        )
+        # concat transforms
+        transforms.append(
+            ConcatTransform(
+                video_concat_order=self.video_keys,
+                state_concat_order=self.state_keys,
+                action_concat_order=self.action_keys,
+            )
+        )
+        # model-specific transform
+        transforms.append(
+            GR00TTransform(
+                state_horizon=len(self.observation_indices),
+                action_horizon=len(self.action_indices),
+                max_state_dim=64,
+                max_action_dim=32,
+            )
+        )
+        return ComposedModalityTransform(transforms=transforms)
+###########################################################################################
+class egodex_naive_config(BaseDataConfig):
+    video_keys = ["video.camera"]
+    state_keys = [
+        "state.left_hand",
+        "state.left_hand_rotation",
+        "state.left_hand_fingertips",
+        "state.right_hand",
+        "state.right_hand_rotation",
+        "state.right_hand_fingertips",
+    ]
+    action_keys = [
+        "action.left_hand",
+        "action.left_hand_rotation",
+        "action.left_hand_fingertips",
+        "action.right_hand",
+        "action.right_hand_rotation",
+        "action.right_hand_fingertips",
+    ]
+    language_keys = ["annotation.language_instruction"]
+    observation_indices = [0]
+    action_indices = list(range(16))
+    action_dim = 48
+    def modality_config(self) -> dict[str, ModalityConfig]:
+        video_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.video_keys,
+        )
+        state_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.state_keys,
+        )
+        action_modality = ModalityConfig(
+            delta_indices=self.action_indices,
+            modality_keys=self.action_keys,
+        )
+        language_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.language_keys,
+        )
+        modality_configs = {
+            "video": video_modality,
+            "state": state_modality,
+            "action": action_modality,
+            "language": language_modality,
+        }
+        return modality_configs
+    def transform(self) -> ModalityTransform:
+        transforms = [
+            # video transforms
+            VideoToTensor(apply_to=self.video_keys),
+            VideoCrop(apply_to=self.video_keys, scale=0.95),
+            VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"),
+            VideoColorJitter(
+                apply_to=self.video_keys,
+                brightness=0.2,
+                contrast=0.2,
+                saturation=0.1,
+                hue=0.0,
+            ),
+            VideoToNumpy(apply_to=self.video_keys),
+            # state transforms
+            StateActionToTensor(apply_to=self.state_keys),
+            StateActionTransform(
+                apply_to=self.state_keys,
+                normalization_modes={key: "q99" for key in self.state_keys},
+            ),
+            # action transforms
+            StateActionToTensor(apply_to=self.action_keys),
+            StateActionTransform(
+                apply_to=self.action_keys,
+                normalization_modes={key: "q99" for key in self.action_keys},
+            ),
+            # concat transforms
+            ConcatTransform(
+                video_concat_order=self.video_keys,
+                state_concat_order=self.state_keys,
+                action_concat_order=self.action_keys,
+            ),
+            # model-specific transform
+            GR00TTransform(
+                state_horizon=len(self.observation_indices),
+                action_horizon=len(self.action_indices),
+                max_state_dim=64,
+                max_action_dim=self.action_dim,
+            ),
+        ]
+        return ComposedModalityTransform(transforms=transforms)
+class egodex_mano_config(BaseDataConfig):
+    video_keys = ["video.camera"]
+    state_keys = [
+        "state.left_hand",
+        "state.left_hand_rotation",
+        "state.left_hand_mano21_joints",
+        "state.right_hand",
+        "state.right_hand_rotation",
+        "state.right_hand_mano21_joints",
+    ]
+    action_keys = [
+        "action.left_hand",
+        "action.left_hand_rotation",
+        "action.left_hand_mano21_joints",
+        "action.right_hand",
+        "action.right_hand_rotation",
+        "action.right_hand_mano21_joints",
+    ]
+    language_keys = ["annotation.language_instruction"]
+    observation_indices = [0]
+    action_indices = list(range(16))
+    action_dim = 144
+    def modality_config(self) -> dict[str, ModalityConfig]:
+        video_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.video_keys,
+        )
+        state_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.state_keys,
+        )
+        action_modality = ModalityConfig(
+            delta_indices=self.action_indices,
+            modality_keys=self.action_keys,
+        )
+        language_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.language_keys,
+        )
+        modality_configs = {
+            "video": video_modality,
+            "state": state_modality,
+            "action": action_modality,
+            "language": language_modality,
+        }
+        return modality_configs
+    def transform(self) -> ModalityTransform:
+        transforms = [
+            # video transforms
+            VideoToTensor(apply_to=self.video_keys),
+            VideoCrop(apply_to=self.video_keys, scale=0.95),
+            VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"),
+            VideoColorJitter(
+                apply_to=self.video_keys,
+                brightness=0.2,
+                contrast=0.2,
+                saturation=0.1,
+                hue=0.0,
+            ),
+            VideoToNumpy(apply_to=self.video_keys),
+            # state transforms
+            StateActionToTensor(apply_to=self.state_keys),
+            StateActionTransform(
+                apply_to=self.state_keys,
+                normalization_modes={key: "q99" for key in self.state_keys},
+            ),
+            # action transforms
+            StateActionToTensor(apply_to=self.action_keys),
+            StateActionTransform(
+                apply_to=self.action_keys,
+                normalization_modes={key: "q99" for key in self.action_keys},
+            ),
+            # concat transforms
+            ConcatTransform(
+                video_concat_order=self.video_keys,
+                state_concat_order=self.state_keys,
+                action_concat_order=self.action_keys,
+            ),
+            # model-specific transform
+            GR00TTransform(
+                state_horizon=len(self.observation_indices),
+                action_horizon=len(self.action_indices),
+                max_state_dim=64,
+                max_action_dim=self.action_dim,
+            ),
+        ]
+        return ComposedModalityTransform(transforms=transforms)
+class agibot_naive_config(BaseDataConfig):
+    video_keys = ["video.top_head", "video.hand_left", "video.hand_right"]
+    state_keys = ["state.observation_state"]
+    action_keys = ["action.action"]
+    language_keys = ["annotation.language_instruction"]
+    observation_indices = [0]
+    action_indices = list(range(22))  # 22-dimensional action
+    action_dim = 22
+    def modality_config(self) -> dict[str, ModalityConfig]:
+        video_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.video_keys,
+        )
+        state_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.state_keys,
+        )
+        action_modality = ModalityConfig(
+            delta_indices=self.action_indices,
+            modality_keys=self.action_keys,
+        )
+        language_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.language_keys,
+        )
+        modality_configs = {
+            "video": video_modality,
+            "state": state_modality,
+            "action": action_modality,
+            "language": language_modality,
+        }
+        return modality_configs
+    def transform(self) -> ModalityTransform:
+        transforms = [
+            # video transforms
+            VideoToTensor(apply_to=self.video_keys),
+            VideoCrop(apply_to=self.video_keys, scale=0.95),
+            VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"),
+            VideoColorJitter(
+                apply_to=self.video_keys,
+                brightness=0.2,
+                contrast=0.2,
+                saturation=0.1,
+                hue=0.0,
+            ),
+            VideoToNumpy(apply_to=self.video_keys),
+            # state transforms
+            StateActionToTensor(apply_to=self.state_keys),
+            StateActionTransform(
+                apply_to=self.state_keys,
+                normalization_modes={key: "q99" for key in self.state_keys},
+            ),
+            # action transforms
+            StateActionToTensor(apply_to=self.action_keys),
+            StateActionTransform(
+                apply_to=self.action_keys,
+                normalization_modes={key: "q99" for key in self.action_keys},
+            ),
+            # concat transforms
+            ConcatTransform(
+                video_concat_order=self.video_keys,
+                state_concat_order=self.state_keys,
+                action_concat_order=self.action_keys,
+            ),
+            # model-specific transform
+            GR00TTransform(
+                state_horizon=len(self.observation_indices),
+                action_horizon=len(self.action_indices),
+                max_state_dim=64,
+                max_action_dim=self.action_dim,
+            ),
+        ]
+        return ComposedModalityTransform(transforms=transforms)
+class allex_thetwo_ck40_egostereo_config(BaseDataConfig):
+    video_keys = ["video.camera_ego_left", "video.camera_ego_right"]
+    state_keys = [
+        "state.right_arm_joints",
+        "state.left_arm_joints",
+        "state.right_hand_joints",
+        "state.left_hand_joints",
+        "state.neck_joints",
+        "state.waist_joints",
+    ]
+    action_keys = [
+        "action.right_arm_joints",
+        "action.left_arm_joints",
+        "action.right_hand_joints",
+        "action.left_hand_joints",
+        "action.neck_joints",
+        "action.waist_joints",
+    ]
+    language_keys = ["annotation.human.task_description"]
+    observation_indices = [0]
+    action_indices = list(range(40))
+    action_dim = 48
+    def modality_config(self) -> dict[str, ModalityConfig]:
+        video_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.video_keys,
+        )
+        state_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.state_keys,
+        )
+        action_modality = ModalityConfig(
+            delta_indices=self.action_indices,
+            modality_keys=self.action_keys,
+        )
+        language_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.language_keys,
+        )
+        modality_configs = {
+            "video": video_modality,
+            "state": state_modality,
+            "action": action_modality,
+            "language": language_modality,
+        }
+        return modality_configs
+    def transform(self) -> ModalityTransform:
+        transforms = [
+            # video transforms
+            VideoToTensor(apply_to=self.video_keys),
+            VideoCrop(apply_to=self.video_keys, scale=0.95),
+            VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"),
+            VideoColorJitter(
+                apply_to=self.video_keys,
+                brightness=0.2,
+                contrast=0.2,
+                saturation=0.2,
+                hue=0.1,
+            ),
+            VideoToNumpy(apply_to=self.video_keys),
+            # state transforms
+            StateActionToTensor(apply_to=self.state_keys),
+            StateActionTransform(
+                apply_to=self.state_keys,
+                normalization_modes={key: "q99" for key in self.state_keys},
+            ),
+            # action transforms
+            StateActionToTensor(apply_to=self.action_keys),
+            StateActionTransform(
+                apply_to=self.action_keys,
+                normalization_modes={key: "q99" for key in self.action_keys},
+            ),
+            # concat transforms
+            ConcatTransform(
+                video_concat_order=self.video_keys,
+                state_concat_order=self.state_keys,
+                action_concat_order=self.action_keys,
+            ),
+            # model-specific transform
+            GR00TTransform(
+                state_horizon=len(self.observation_indices),
+                action_horizon=len(self.action_indices),
+                max_state_dim=64,
+                max_action_dim=self.action_dim,
+            ),
+        ]
+        return ComposedModalityTransform(transforms=transforms)
+###########################################################################################
+class openarm_ck40_egostereo_config(BaseDataConfig):
+    video_keys = ["video.camera_ego_left", "video.camera_ego_right"]
+    state_keys = [
+        "state.right_arm_joints",
+        "state.left_arm_joints",
+        "state.right_hand_joints",
+        "state.left_hand_joints",
+        "state.neck_joints",
+    ]
+    action_keys = [
+        "action.right_arm_joints",
+        "action.left_arm_joints",
+        "action.right_hand_joints",
+        "action.left_hand_joints",
+        "action.neck_joints",
+    ]
+    language_keys = ["annotation.human.task_description"]
+    observation_indices = [0]
+    action_indices = list(range(40))
+    action_dim = 28
+    def modality_config(self) -> dict[str, ModalityConfig]:
+        video_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.video_keys,
+        )
+        state_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.state_keys,
+        )
+        action_modality = ModalityConfig(
+            delta_indices=self.action_indices,
+            modality_keys=self.action_keys,
+        )
+        language_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.language_keys,
+        )
+        modality_configs = {
+            "video": video_modality,
+            "state": state_modality,
+            "action": action_modality,
+            "language": language_modality,
+        }
+        return modality_configs
+    def transform(self) -> ModalityTransform:
+        transforms = [
+            # video transforms
+            VideoToTensor(apply_to=self.video_keys),
+            VideoCrop(apply_to=self.video_keys, scale=0.95),
+            VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"),
+            VideoColorJitter(
+                apply_to=self.video_keys,
+                brightness=0.2,
+                contrast=0.2,
+                saturation=0.2,
+                hue=0.1,
+            ),
+            VideoToNumpy(apply_to=self.video_keys),
+            # state transforms
+            StateActionToTensor(apply_to=self.state_keys),
+            StateActionTransform(
+                apply_to=self.state_keys,
+                normalization_modes={key: "q99" for key in self.state_keys},
+            ),
+            # action transforms
+            StateActionToTensor(apply_to=self.action_keys),
+            StateActionTransform(
+                apply_to=self.action_keys,
+                normalization_modes={key: "q99" for key in self.action_keys},
+            ),
+            # concat transforms
+            ConcatTransform(
+                video_concat_order=self.video_keys,
+                state_concat_order=self.state_keys,
+                action_concat_order=self.action_keys,
+            ),
+            # model-specific transform
+            GR00TTransform(
+                state_horizon=len(self.observation_indices),
+                action_horizon=len(self.action_indices),
+                max_state_dim=64,
+                max_action_dim=self.action_dim,
+            ),
+        ]
+        return ComposedModalityTransform(transforms=transforms)
+class AgibotBetaDataConfig(BaseDataConfig):
+    video_keys = [
+        "video.top_head",
+    ]
+    state_keys = [
+        "state.left_arm_joint_position",
+        "state.right_arm_joint_position",
+        "state.left_effector_position",
+        "state.right_effector_position",
+        "state.head_position",
+        "state.waist_position",
+    ]
+    action_keys = [
+        "action.left_arm_joint_position",
+        "action.right_arm_joint_position",
+        "action.left_effector_position",
+        "action.right_effector_position",
+        "action.head_position",
+        "action.waist_position",
+    ]
+    language_keys = ["annotation.language.action_text"]
+    observation_indices = [0]
+    action_indices = list(range(16))
+    def transform(self):
+        transforms = [
+            # video transforms
+            VideoToTensor(apply_to=self.video_keys),
+            VideoCrop(apply_to=self.video_keys, scale=0.95),
+            VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"),
+            VideoColorJitter(
+                apply_to=self.video_keys,
+                brightness=0.3,
+                contrast=0.4,
+                saturation=0.5,
+                hue=0.08,
+            ),
+            VideoToNumpy(apply_to=self.video_keys),
+            # state transforms
+            StateActionToTensor(apply_to=self.state_keys),
+            StateActionTransform(
+                apply_to=self.state_keys,
+                normalization_modes={key: "min_max" for key in self.state_keys},
+            ),
+            # action transforms
+            StateActionToTensor(apply_to=self.action_keys),
+            StateActionTransform(
+                apply_to=self.action_keys,
+                normalization_modes={key: "min_max" for key in self.action_keys},
+            ),
+            # concat transforms
+            ConcatTransform(
+                video_concat_order=self.video_keys,
+                state_concat_order=self.state_keys,
+                action_concat_order=self.action_keys,
+            ),
+            GR00TTransform(
+                state_horizon=len(self.observation_indices),
+                action_horizon=len(self.action_indices),
+                max_state_dim=64,
+                max_action_dim=32,
+            ),
+        ]
+        return ComposedModalityTransform(transforms=transforms)
+class egodex_naive_config(BaseDataConfig):
+    video_keys = ["video.camera"]
+    state_keys = [
+        "state.left_hand",
+        "state.left_hand_rotation",
+        "state.left_hand_fingertips",
+        "state.right_hand",
+        "state.right_hand_rotation",
+        "state.right_hand_fingertips",
+    ]
+    action_keys = [
+        "action.left_hand",
+        "action.left_hand_rotation",
+        "action.left_hand_fingertips",
+        "action.right_hand",
+        "action.right_hand_rotation",
+        "action.right_hand_fingertips",
+    ]
+    language_keys = ["annotation.language_instruction"]
+    observation_indices = [0]
+    action_indices = list(range(16))
+    action_dim = 48
+    def modality_config(self) -> dict[str, ModalityConfig]:
+        video_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.video_keys,
+        )
+        state_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.state_keys,
+        )
+        action_modality = ModalityConfig(
+            delta_indices=self.action_indices,
+            modality_keys=self.action_keys,
+        )
+        language_modality = ModalityConfig(
+            delta_indices=self.observation_indices,
+            modality_keys=self.language_keys,
+        )
+        modality_configs = {
+            "video": video_modality,
+            "state": state_modality,
+            "action": action_modality,
+            "language": language_modality,
+        }
+        return modality_configs
+    def transform(self) -> ModalityTransform:
+        transforms = [
+            # video transforms
+            VideoToTensor(apply_to=self.video_keys),
+            VideoCrop(apply_to=self.video_keys, scale=0.95),
+            VideoResize(apply_to=self.video_keys, height=224, width=224, interpolation="linear"),
+            VideoColorJitter(
+                apply_to=self.video_keys,
+                brightness=0.2,
+                contrast=0.2,
+                saturation=0.1,
+                hue=0.0,
+            ),
+            VideoToNumpy(apply_to=self.video_keys),
+            # state transforms
+            StateActionToTensor(apply_to=self.state_keys),
+            StateActionTransform(
+                apply_to=self.state_keys,
+                normalization_modes={key: "q99" for key in self.state_keys},
+            ),
+            # action transforms
+            StateActionToTensor(apply_to=self.action_keys),
+            StateActionTransform(
+                apply_to=self.action_keys,
+                normalization_modes={key: "q99" for key in self.action_keys},
+            ),
+            # concat transforms
+            ConcatTransform(
+                video_concat_order=self.video_keys,
+                state_concat_order=self.state_keys,
+                action_concat_order=self.action_keys,
+            ),
+            # model-specific transform
+            GR00TTransform(
+                state_horizon=len(self.observation_indices),
+                action_horizon=len(self.action_indices),
+                max_state_dim=64,
+                max_action_dim=self.action_dim,
+            ),
+        ]
+        return ComposedModalityTransform(transforms=transforms)
+###########################################################################################
+@dataclass
+class AllexSimDataConfig(BaseDataConfig):
+    video_keys = ["video.robot_pov_left_cam"]
+    state_keys = [
+        "state.left_eef_pos",
+        "state.left_eef_quat",
+        "state.right_eef_pos",
+        "state.right_eef_quat",
+        "state.hand_joint_state",
+        "state.head_joint_state",
+    ]
+    action_keys = [
+        "action.left_eef_pos",
+        "action.left_eef_quat",
+        "action.right_eef_pos",
+        "action.right_eef_quat",
+        "action.hand_joint_state",
+        "action.head_joint_state",
+    ]
+    language_keys = ["annotation.human.task_description"]
+    observation_indices = [0]
+    action_indices = list(range(32))
+    action_dim = 46
+    # For a remote client which sends image with 224x224 resolution
+    # For _remote option, images are already resized to 224x224 by the environment
+    is_remote: bool = False
+    def transform(self):
+        transforms: list[ModalityTransform] = [
+            # video transforms
+            VideoToTensor(apply_to=self.video_keys, check_resolution=not self.is_remote),
+        ]
+        if not self.is_remote:
+            transforms.extend(
+                [
+                    VideoCrop(apply_to=self.video_keys, scale=0.95),
+                    VideoResize(
+                        apply_to=self.video_keys, height=224, width=224, interpolation="linear"
+                    ),
+                ]
+            )
+        transforms.extend(
+            [
+                VideoColorJitter(
+                    apply_to=self.video_keys,
+                    brightness=0.3,
+                    contrast=0.4,
+                    saturation=0.5,
+                    hue=0.08,
+                ),
+                VideoToNumpy(apply_to=self.video_keys),
+                # state transforms
+                StateActionToTensor(apply_to=self.state_keys),
+                StateActionTransform(
+                    apply_to=self.state_keys,
+                    normalization_modes={key: "min_max" for key in self.state_keys},
+                ),
+                # action transforms
+                StateActionToTensor(apply_to=self.action_keys),
+                StateActionTransform(
+                    apply_to=self.action_keys,
+                    normalization_modes={key: "min_max" for key in self.action_keys},
+                ),
+                # concat transforms
+                ConcatTransform(
+                    video_concat_order=self.video_keys,
+                    state_concat_order=self.state_keys,
+                    action_concat_order=self.action_keys,
+                ),
+                GR00TTransform(
+                    state_horizon=len(self.observation_indices),
+                    action_horizon=len(self.action_indices),
+                    max_state_dim=64,
+                    max_action_dim=self.action_dim,
+                ),
+            ]
+        )
+        return ComposedModalityTransform(transforms=transforms)
+@dataclass
+class AllexSimWithoutHeadDataConfig(AllexSimDataConfig):
+    state_keys = [
+        "state.left_eef_pos",
+        "state.left_eef_quat",
+        "state.right_eef_pos",
+        "state.right_eef_quat",
+        "state.hand_joint_state",
+    ]
+    action_keys = [
+        "action.left_eef_pos",
+        "action.left_eef_quat",
+        "action.right_eef_pos",
+        "action.right_eef_quat",
+        "action.hand_joint_state",
+    ]
+    action_dim = 44
+@dataclass
+class AllexSimWithoutHeadDataConfig(AllexSimDataConfig):
+    state_keys = [
+        "state.left_eef_pos",
+        "state.left_eef_quat",
+        "state.right_eef_pos",
+        "state.right_eef_quat",
+        "state.hand_joint_state",
+    ]
+    action_keys = [
+        "action.left_eef_pos",
+        "action.left_eef_quat",
+        "action.right_eef_pos",
+        "action.right_eef_quat",
+        "action.hand_joint_state",
+    ]
+    action_dim = 44
+## Allex Real Data Configs
+# 1. Mono vs Stereo
+@dataclass
+class AllexRealMonoConfig(BaseDataConfig):
+    video_keys = ["video.camera_ego_left"]
+    state_keys = [
+        "state.right_arm_joints",
+        "state.left_arm_joints",
+        "state.right_hand_joints",
+        "state.left_hand_joints",
+        "state.neck_joints",
+        "state.waist_joints",
+    ]
+    action_keys = [
+        "action.right_arm_joints",
+        "action.left_arm_joints",
+        "action.right_hand_joints",
+        "action.left_hand_joints",
+        "action.neck_joints",
+        "action.waist_joints",
+    ]
+    language_keys = ["annotation.human.task_description"]
+    observation_indices = [0]
+    action_indices = list(range(16))
+    action_dim = 48
+    is_remote: bool = False
+    def transform(self):
+        transforms = [
+            # video transforms
+            VideoToTensor(apply_to=self.video_keys, check_resolution=not self.is_remote),
+        ]
+        if not self.is_remote:
+            transforms.extend(
+                [
+                    VideoCrop(apply_to=self.video_keys, scale=0.95),
+                    VideoResize(
+                        apply_to=self.video_keys, height=224, width=224, interpolation="linear"
+                    ),
+                ]
+            )
+        transforms.extend(
+            [
+            VideoColorJitter(
+                apply_to=self.video_keys,
+                brightness=0.3,
+                contrast=0.4,
+                saturation=0.5,
+                hue=0.08,
+            ),
+            VideoToNumpy(apply_to=self.video_keys),
+            # state transforms
+            StateActionToTensor(apply_to=self.state_keys),
+            StateActionTransform(
+                apply_to=self.state_keys,
+                normalization_modes={key: "min_max" for key in self.state_keys},
+            ),
+            # action transforms
+            StateActionToTensor(apply_to=self.action_keys),
+            StateActionTransform(
+                apply_to=self.action_keys,
+                normalization_modes={key: "min_max" for key in self.action_keys},
+            ),
+            # concat transforms
+            ConcatTransform(
+                video_concat_order=self.video_keys,
+                state_concat_order=self.state_keys,
+                action_concat_order=self.action_keys,
+            ),
+            GR00TTransform(
+                state_horizon=len(self.observation_indices),
+                action_horizon=len(self.action_indices),
+                max_state_dim=66,
+                max_action_dim=self.action_dim,
+            ),
+        ]
+    )
+        return ComposedModalityTransform(transforms=transforms)
+class AllexRealStereoConfig(AllexRealMonoConfig):
+    video_keys = ["video.camera_ego_left", "video.camera_ego_right"]
+@dataclass
+class Rby1WujiDataConfig(BaseDataConfig):
+    video_keys = ["video.zed_left", "video.zed_right"]
+    state_keys = ["state.joint_position"]
+    action_keys = ["action.joint_position"]
+    language_keys = ["annotation.human.task_description"]
+    observation_indices = [0]
+    action_indices = list(range(40))
+    action_dim = 66
+    def transform(self):
+        transforms = [
+            VideoToTensor(apply_to=self.video_keys),
+            VideoCrop(apply_to=self.video_keys, scale=0.95),
+            VideoResize(
+                apply_to=self.video_keys, height=224, width=224, interpolation="linear"
+            ),
+        ]
+        transforms.extend(
+            [
+                VideoColorJitter(
+                    apply_to=self.video_keys,
+                    brightness=0.5,
+                    contrast=0.5,
+                    saturation=0.5,
+                    hue=0.2,
+                ),
+                VideoToNumpy(apply_to=self.video_keys),
+                StateActionToTensor(apply_to=self.state_keys),
+                StateActionTransform(
+                    apply_to=self.state_keys,
+                    normalization_modes={key: "min_max" for key in self.state_keys},
+                ),
+                StateActionToTensor(apply_to=self.action_keys),
+                StateActionTransform(
+                    apply_to=self.action_keys,
+                    normalization_modes={key: "min_max" for key in self.action_keys},
+                ),
+                ConcatTransform(
+                    video_concat_order=self.video_keys,
+                    state_concat_order=self.state_keys,
+                    action_concat_order=self.action_keys,
+                ),
+                GR00TTransform(
+                    state_horizon=len(self.observation_indices),
+                    action_horizon=len(self.action_indices),
+                    max_state_dim=66,
+                    max_action_dim=self.action_dim,
+                ),
+            ]
+        )
+        return ComposedModalityTransform(transforms=transforms)
+###########################################################################################
+# Registry mapping a config name to a data config instance. Every value is
+# constructed right here when this statement executes, so each lookup returns
+# the same shared instance for a given name.
+DATA_CONFIG_MAP = {
+    "fourier_gr1_arms_waist": FourierGr1ArmsWaistDataConfig(),
+    "fourier_gr1_arms_waist_with_mano": FourierGr1ArmsWaistWithMANODataConfig(),
+    "fourier_gr1_arms_only": FourierGr1ArmsOnlyDataConfig(),
+    "fourier_gr1_full_upper_body": FourierGr1FullUpperBodyDataConfig(),
+    "bimanual_panda_gripper": BimanualPandaGripperDataConfig(),
+    "bimanual_panda_hand": BimanualPandaHandDataConfig(),
+    "single_panda_gripper": SinglePandaGripperDataConfig(),
+    "so100": So100DataConfig(),
+    "so100_dualcam": So100DualCamDataConfig(),
+    "unitree_g1": UnitreeG1DataConfig(),
+    "unitree_g1_full_body": UnitreeG1FullBodyDataConfig(),
+    "oxe_droid": OxeDroidDataConfig(),
+    "agibot_genie1": AgibotGenie1DataConfig(),
+    "gr1": Gr1DataConfig(),
+    "gr1_no_image": Gr1NoImageDataConfig(),
+    # The lower-case *_config names are callables defined outside this section.
+    "allex_thetwo_ck40_egostereo": allex_thetwo_ck40_egostereo_config(),
+    "openarm_ck40_egostereo" : openarm_ck40_egostereo_config(),
+    "egodex_naive": egodex_naive_config(),
+    "egodex_mano": egodex_mano_config(),
+    "agibot_naive": agibot_naive_config(),
+    "agibot_beta1": AgibotBetaDataConfig(),
+    # Allex entries: is_remote=True makes transform() skip the crop/resize step.
+    "allex_sim": AllexSimDataConfig(),
+    "allex_sim_remote": AllexSimDataConfig(is_remote=True),
+    "allex_real_mono": AllexRealMonoConfig(),
+    "allex_real_stereo": AllexRealStereoConfig(),
+    # NOTE(review): the "allex_sim_mono"/"allex_sim_stereo" names reuse the
+    # AllexReal* classes with is_remote=True — confirm this is intentional.
+    "allex_sim_mono": AllexRealMonoConfig(is_remote=True),
+    "allex_sim_stereo": AllexRealStereoConfig(is_remote=True),
+    "allex_sim_without_head": AllexSimWithoutHeadDataConfig(),
+    "allex_sim_without_head_remote": AllexSimWithoutHeadDataConfig(is_remote=True),
+    "rby1_wuji": Rby1WujiDataConfig(),
+}

checkpoint-10000/experiment_cfg/metadata.json ADDED Viewed

	@@ -0,0 +1,871 @@

+{
+    "new_embodiment": {
+        "statistics": {
+            "state": {
+                "joint_position": {
+                    "max": [
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0540158674120903,
+                        0.008619149215519428,
+                        -0.01737499050796032,
+                        0.011375758796930313,
+                        0.009750650264322758,
+                        0.24627606570720673,
+                        1.0723133087158203,
+                        -0.03839508444070816,
+                        0.9127033948898315,
+                        0.00562712736427784,
+                        1.8438678979873657,
+                        0.9284341931343079,
+                        0.01166592352092266,
+                        1.668798804283142,
+                        0.44089144468307495,
+                        0.9650468230247498,
+                        1.6112595796585083,
+                        1.2180935144424438,
+                        0.2822831869125366,
+                        1.6737557649612427,
+                        1.6050000190734863,
+                        1.4051698446273804,
+                        0.08793547004461288,
+                        1.6089346408843994,
+                        1.5881896018981934,
+                        1.6058990955352783,
+                        0.04430322349071503,
+                        1.642262578010559,
+                        1.6696302890777588,
+                        1.5714726448059082,
+                        0.15678425133228302,
+                        1.597379446029663,
+                        1.6042011976242065,
+                        0.5594549179077148,
+                        0.6180081963539124,
+                        0.7730085849761963,
+                        -0.004590551368892193,
+                        1.4090687036514282,
+                        0.8553937077522278,
+                        2.8069164752960205,
+                        1.5063496828079224,
+                        0.3504810333251953,
+                        0.3728719651699066,
+                        1.5741111040115356,
+                        0.8213971853256226,
+                        0.04377385973930359,
+                        0.019891871139407158,
+                        0.7548595666885376,
+                        0.7196071743965149,
+                        0.10614115744829178,
+                        0.03523240610957146,
+                        0.7286884784698486,
+                        0.7055163383483887,
+                        0.1200566291809082,
+                        0.015419094823300838,
+                        0.32709723711013794,
+                        0.36708328127861023,
+                        0.03174339234828949,
+                        0.2119518369436264,
+                        1.3741862773895264,
+                        3.834952167380834e-06,
+                        -0.5998210310935974
+                    ],
+                    "min": [
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        -0.01123147364705801,
+                        -0.07895824313163757,
+                        -0.05312357842922211,
+                        -0.011371961794793606,
+                        -0.009332981891930103,
+                        -1.4737378358840942,
+                        -0.7509519457817078,
+                        -0.8685862421989441,
+                        -0.7078827619552612,
+                        -2.4324493408203125,
+                        -1.2745846509933472,
+                        -1.5293941497802734,
+                        -2.6313905715942383,
+                        -0.006295211613178253,
+                        -0.21899408102035522,
+                        -0.00438659219071269,
+                        -0.06925009936094284,
+                        -0.15651625394821167,
+                        -0.10530298203229904,
+                        -0.025816796347498894,
+                        -0.15664206445217133,
+                        -0.1108212023973465,
+                        -0.3239299952983856,
+                        -0.12454989552497864,
+                        -0.013239393942058086,
+                        -0.13757586479187012,
+                        -0.21419131755828857,
+                        -0.029919717460870743,
+                        -0.16278579831123352,
+                        -0.24725651741027832,
+                        -0.20329144597053528,
+                        -0.040407828986644745,
+                        -0.45288756489753723,
+                        -0.5406219363212585,
+                        0.05294891446828842,
+                        -1.610653281211853,
+                        -1.3624293804168701,
+                        -1.4776990413665771,
+                        -1.1493887901306152,
+                        0.5057151317596436,
+                        -0.23366180062294006,
+                        -0.005929233506321907,
+                        -0.01660371571779251,
+                        -0.01076052337884903,
+                        -0.004445623606443405,
+                        -0.0933440625667572,
+                        -0.00807812251150608,
+                        -0.0057681952603161335,
+                        -0.006203831639140844,
+                        -0.06876560300588608,
+                        -0.030899088829755783,
+                        -0.004242096561938524,
+                        -0.005740335676819086,
+                        -0.02193913422524929,
+                        -0.01582074724137783,
+                        -0.1694127470254898,
+                        -0.13835637271404266,
+                        -0.16616317629814148,
+                        -0.11304554343223572,
+                        -0.020096570253372192,
+                        -0.00011888350854860619,
+                        -0.6000050902366638
+                    ],
+                    "mean": [
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.028436615020000926,
+                        -0.03310545571542166,
+                        -0.02831603293043386,
+                        -0.002489809312055913,
+                        0.006085372967137075,
+                        -0.3857839695392187,
+                        -0.014985148514616058,
+                        -0.20059175430304702,
+                        0.020276488908730312,
+                        -1.0443872404524919,
+                        0.023258577339383097,
+                        -0.1142688779204578,
+                        -1.3937374825429794,
+                        0.6558063841791327,
+                        -0.08927425675725988,
+                        0.5556008032503482,
+                        0.3315836355578498,
+                        0.46150773638786774,
+                        0.07591597844533991,
+                        0.14104579194652192,
+                        0.5127873260199133,
+                        0.4920027032391028,
+                        -0.10307410965098776,
+                        0.07758055805280992,
+                        0.7382627254586968,
+                        0.47719562014059813,
+                        -0.07254886993992268,
+                        0.030039534178359563,
+                        0.5418144791439871,
+                        0.4868134670421312,
+                        -0.0349460562227702,
+                        0.050387433762820816,
+                        0.7191163128647389,
+                        0.12979891806355587,
+                        0.19467059752776283,
+                        -0.32792739139429206,
+                        -0.5835160602302355,
+                        -0.08403944264891663,
+                        0.1390610603205583,
+                        1.8909284911263653,
+                        0.4950957846158421,
+                        0.2381789575086811,
+                        0.22612730576569545,
+                        0.18883140589784606,
+                        0.3740392055027783,
+                        -0.04032462804929162,
+                        0.008036058131466807,
+                        0.37591508523796946,
+                        0.32645487851920024,
+                        0.03241679756760204,
+                        0.004207118530478635,
+                        0.2470434163746666,
+                        0.3192236355839802,
+                        0.04590547429864797,
+                        0.0011438596929481235,
+                        0.026781440791798408,
+                        0.15448183264024357,
+                        -0.07243912484280461,
+                        -0.07909043829547209,
+                        0.8210403514325914,
+                        -6.785587383775982e-07,
+                        -0.5999860812434155
+                    ],
+                    "std": [
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.009296067132398588,
+                        0.01086359387947464,
+                        0.006314352853837057,
+                        0.002925380853794191,
+                        0.0020385652110806902,
+                        0.5772557183512236,
+                        0.2241035469436585,
+                        0.15809088377394892,
+                        0.22598680879665564,
+                        0.4978266314448067,
+                        0.3561884996721128,
+                        0.3675321177585772,
+                        0.42166286761675736,
+                        0.36183275641474494,
+                        0.10626282067482194,
+                        0.12482011798152456,
+                        0.26866527570065346,
+                        0.30810642814769373,
+                        0.03734363734743939,
+                        0.2114097732846818,
+                        0.3669871051808872,
+                        0.32013936266443427,
+                        0.03549505877621235,
+                        0.22845467340942432,
+                        0.36432746939207455,
+                        0.36440028820190384,
+                        0.03600196760548027,
+                        0.13540730582439536,
+                        0.4923920481254162,
+                        0.36305092071697675,
+                        0.04654938606664865,
+                        0.13235952485277672,
+                        0.544977801560137,
+                        0.19417089262103823,
+                        0.15282655265234688,
+                        0.39691415272888464,
+                        0.36750595534209435,
+                        0.2645354107326195,
+                        0.29782887812728026,
+                        0.3383129775685425,
+                        0.24432653890860784,
+                        0.08139908079444687,
+                        0.07271124468975532,
+                        0.10743092529527191,
+                        0.14195018048311717,
+                        0.020357388766090088,
+                        0.003097414753963125,
+                        0.13929327335560418,
+                        0.13106278870888993,
+                        0.026678754705602218,
+                        0.00694399976100214,
+                        0.10023644500237974,
+                        0.12793384947018274,
+                        0.02815470361234968,
+                        0.003935586811798535,
+                        0.0694829081784891,
+                        0.07363449848839962,
+                        0.03472060782374721,
+                        0.026450936545833404,
+                        0.2973850084596476,
+                        8.890689092689295e-06,
+                        5.75834431548448e-05
+                    ],
+                    "q01": [
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.021795967778423478,
+                        -0.05696901685593958,
+                        -0.03935664577863019,
+                        -0.006686112227699354,
+                        0.0034296106734705363,
+                        -1.3782285479010636,
+                        -0.4497670105132723,
+                        -0.6115826922007739,
+                        -0.471112083088,
+                        -1.8056593314667733,
+                        -0.8440315607185341,
+                        -1.1620166715406945,
+                        -2.2094820753430424,
+                        0.004424640155203718,
+                        -0.20477636585097375,
+                        0.002928900425599604,
+                        0.0015383478993190457,
+                        -0.01212095601448354,
+                        -0.008005311749701495,
+                        0.11194821075318842,
+                        0.0034959941221639835,
+                        -0.0065810612466863224,
+                        -0.16402095284923815,
+                        -0.002347375323843718,
+                        0.004417142143493669,
+                        -0.02706049374707401,
+                        -0.15492323031416538,
+                        -0.006735840077794811,
+                        -0.016644652454015817,
+                        -0.05916948924771991,
+                        -0.14416287514004364,
+                        -0.006907277723886991,
+                        -0.14870543771038225,
+                        -0.31457522036852636,
+                        0.0667847342818645,
+                        -1.2423695252790405,
+                        -1.2364301253427723,
+                        -0.676006123363979,
+                        -0.6129555665262777,
+                        1.3263865136350677,
+                        -0.05522501892142832,
+                        0.0018785531286670443,
+                        0.005422156973198853,
+                        0.0055092103669175715,
+                        0.002438621935963896,
+                        -0.06136239921327097,
+                        -0.0003404852201681719,
+                        0.00407590085360591,
+                        0.0025376104864900176,
+                        -0.02240152342086038,
+                        -0.011012942661455737,
+                        0.0030068742157323704,
+                        0.002395644428253512,
+                        -0.0053736540469160685,
+                        -0.006330441670826642,
+                        -0.0917777787511835,
+                        -0.009702130489274563,
+                        -0.13127400019864632,
+                        -0.09008882077147813,
+                        0.010362566773705529,
+                        -2.4414687376485298e-06,
+                        -0.5999960111901739
+                    ],
+                    "q99": [
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.04437459776898255,
+                        -0.006718148228764021,
+                        -0.018936048997243122,
+                        0.004192877528633489,
+                        0.007735770262043791,
+                        0.12165437145979416,
+                        0.4240362063051912,
+                        -0.04807744057382159,
+                        0.5348454862089645,
+                        -0.2242267483064094,
+                        0.6785139612746233,
+                        0.44063327108356026,
+                        -0.5817907597011159,
+                        1.4848808947644698,
+                        0.23677060359863844,
+                        0.8047608976578217,
+                        1.2885358066775385,
+                        1.121296234630144,
+                        0.1530053280244578,
+                        0.47159374137911664,
+                        1.4497268026096963,
+                        1.209493986111197,
+                        0.000607743071994785,
+                        1.1285918553517904,
+                        1.4852443026351887,
+                        1.40864529914145,
+                        0.0005045250833275262,
+                        0.6529258515675681,
+                        1.5684983484657407,
+                        1.410548214903808,
+                        0.04141691976666532,
+                        0.4324983713584415,
+                        1.581882411962951,
+                        0.4360824992849159,
+                        0.536577685217296,
+                        0.17271069713926454,
+                        -0.1923993535500321,
+                        0.6603449199506704,
+                        0.6146886826649628,
+                        2.5299440392122987,
+                        0.8734007741005431,
+                        0.2945541165562756,
+                        0.2660609760662737,
+                        0.4397959618181715,
+                        0.561333966299661,
+                        0.016765962069138527,
+                        0.013682085635355636,
+                        0.5505121417687537,
+                        0.5398925788148042,
+                        0.0695841343717276,
+                        0.01839643415234168,
+                        0.4144001419639858,
+                        0.49493986038591653,
+                        0.08391836784040793,
+                        0.008520232199757685,
+                        0.20705310424289292,
+                        0.25758608095714447,
+                        0.005206322471282864,
+                        0.0012976095516473085,
+                        1.1263225634065734,
+                        7.982910483274839e-07,
+                        -0.5999537469885612
+                    ]
+                }
+            },
+            "action": {
+                "joint_position": {
+                    "max": [
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.03857817500829697,
+                        0.00860599521547556,
+                        0.0,
+                        0.011358398012816906,
+                        0.008795554749667645,
+                        0.2635219991207123,
+                        1.1436481475830078,
+                        -0.03254669904708862,
+                        0.9616425037384033,
+                        0.005560107994824648,
+                        1.9167735576629639,
+                        0.9553606510162354,
+                        0.14417271316051483,
+                        1.6008000373840332,
+                        0.42979976534843445,
+                        0.9053434133529663,
+                        1.5772000551223755,
+                        1.234971523284912,
+                        0.1844240427017212,
+                        1.5772000551223755,
+                        1.5772000551223755,
+                        1.5085276365280151,
+                        0.006402143742889166,
+                        1.5772000551223755,
+                        1.5772000551223755,
+                        1.5859500169754028,
+                        0.019420389086008072,
+                        1.5772000551223755,
+                        1.5772000551223755,
+                        1.5859500169754028,
+                        0.06968191266059875,
+                        1.5772000551223755,
+                        1.5772000551223755,
+                        0.6093692779541016,
+                        0.6748019456863403,
+                        0.8214342594146729,
+                        -0.004697862546890974,
+                        1.4369226694107056,
+                        0.9396975636482239,
+                        2.7821881771087646,
+                        1.6008000373840332,
+                        0.3493163585662842,
+                        0.3661975860595703,
+                        1.5772000551223755,
+                        0.8197538256645203,
+                        0.026058457791805267,
+                        0.015132924541831017,
+                        0.7475559711456299,
+                        0.7178741693496704,
+                        0.10578178614377975,
+                        0.015168641693890095,
+                        0.7295849919319153,
+                        0.6987736821174622,
+                        0.14835047721862793,
+                        0.009813961572945118,
+                        0.32408037781715393,
+                        0.3626656234264374,
+                        0.026074068620800972,
+                        0.2016652673482895,
+                        1.3755035400390625,
+                        3.834952167380834e-06,
+                        -0.5998210310935974
+                    ],
+                    "min": [
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        -0.009972152300179005,
+                        -0.05403241515159607,
+                        -0.04206079617142677,
+                        -0.008545337244868279,
+                        -0.008585446514189243,
+                        -1.4895294904708862,
+                        -0.9075872302055359,
+                        -0.9320250153541565,
+                        -0.7501844167709351,
+                        -2.5271456241607666,
+                        -1.3021537065505981,
+                        -1.559999942779541,
+                        -2.700000047683716,
+                        -0.004508104640990496,
+                        -0.11590000241994858,
+                        -0.002204202115535736,
+                        -0.06962385773658752,
+                        -0.1454038769006729,
+                        -0.004906882997602224,
+                        -0.0047397250309586525,
+                        -0.15006792545318604,
+                        -0.10465795546770096,
+                        -0.20927758514881134,
+                        -0.0025141574442386627,
+                        -0.00471277441829443,
+                        -0.13493074476718903,
+                        -0.21192054450511932,
+                        -0.006507838144898415,
+                        -0.16269022226333618,
+                        -0.24437615275382996,
+                        -0.19966383278369904,
+                        -0.007696053013205528,
+                        -0.4431999921798706,
+                        -0.641176164150238,
+                        0.03254669904708862,
+                        -1.6302410364151,
+                        -1.4542460441589355,
+                        -1.5116034746170044,
+                        -1.1882280111312866,
+                        0.38480550050735474,
+                        -0.04479999840259552,
+                        -0.004017750732600689,
+                        -0.007120981812477112,
+                        -0.008143257349729538,
+                        -0.0033251529093831778,
+                        -0.09708409011363983,
+                        -0.003473518416285515,
+                        -0.0041289618238806725,
+                        -0.004991866648197174,
+                        -0.06705081462860107,
+                        -0.01580159179866314,
+                        -0.0027919195126742125,
+                        -0.004558212589472532,
+                        -0.01956297643482685,
+                        -0.00846139620989561,
+                        -0.16097694635391235,
+                        -0.1373649537563324,
+                        -0.21344037353992462,
+                        -0.09165584295988083,
+                        -0.011517560109496117,
+                        -0.00011888350854860619,
+                        -0.6000050902366638
+                    ],
+                    "mean": [
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0012301454489393134,
+                        -0.0014178637151902394,
+                        -0.0011515726267851373,
+                        -8.99514191918264e-05,
+                        0.0002417336261065251,
+                        -0.385079695781938,
+                        -0.056512973414585665,
+                        -0.205284599628657,
+                        0.021670858474119963,
+                        -1.090174144042536,
+                        0.02189628348629755,
+                        -0.11948687818374201,
+                        -1.3953129200280388,
+                        0.7158406793650635,
+                        -0.07399009379515412,
+                        0.5646530928317394,
+                        0.3322831628858908,
+                        0.4907992098567843,
+                        0.09187406823457182,
+                        0.022187885216056677,
+                        0.5144590627367867,
+                        0.5269970564099565,
+                        -0.10383331986413583,
+                        0.07851809721461797,
+                        0.7411814242423589,
+                        0.5444023280221095,
+                        -0.0734774868730444,
+                        0.03564721708701069,
+                        0.5437700061517406,
+                        0.5513370593516884,
+                        -0.036397602421378954,
+                        0.05125830440624985,
+                        0.7203868271344417,
+                        0.11542250590215877,
+                        0.1909098924988445,
+                        -0.32824815124911455,
+                        -0.6069722333678652,
+                        -0.08471856718111564,
+                        0.13363292776968674,
+                        1.914587773657288,
+                        0.5648819695933174,
+                        0.2560727698616553,
+                        0.24167851919247058,
+                        0.20062104627304525,
+                        0.3984462185737324,
+                        -0.0685205105848946,
+                        0.008445595006489948,
+                        0.40068832818889344,
+                        0.34675098094776313,
+                        0.03585122773362594,
+                        0.0048274081093767985,
+                        0.2606034069992753,
+                        0.33831379298242215,
+                        0.05275099718188223,
+                        0.0005992490332076127,
+                        0.024840247042178354,
+                        0.16453840496608627,
+                        -0.10364606618086784,
+                        -0.08398711226926625,
+                        0.8792984481160785,
+                        -6.785587383775982e-07,
+                        -0.5999860812434155
+                    ],
+                    "std": [
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.005364887313739734,
+                        0.0061500188820826284,
+                        0.00482324811058198,
+                        0.0006118624749094115,
+                        0.0010698437805450641,
+                        0.5759163621187422,
+                        0.25314037443712,
+                        0.1745444488366029,
+                        0.23248598496176517,
+                        0.5151875566816371,
+                        0.363109875655475,
+                        0.37494234688690026,
+                        0.45561049100001433,
+                        0.44951049081390626,
+                        0.09446383905433008,
+                        0.12122754061324706,
+                        0.2648716126033152,
+                        0.3501251981903186,
+                        0.03332956813047136,
+                        0.10150862839363956,
+                        0.3652349936001306,
+                        0.37369597719425024,
+                        0.03457925261671847,
+                        0.23152506479876248,
+                        0.36023274522277116,
+                        0.4740673099425935,
+                        0.035694487186065826,
+                        0.15188985568739302,
+                        0.4924371132376949,
+                        0.45327276640155423,
+                        0.046409641970985446,
+                        0.13415159186384895,
+                        0.543731809265607,
+                        0.21463303789677357,
+                        0.17222252415980077,
+                        0.3952559882935496,
+                        0.3814561269812472,
+                        0.2757738719846831,
+                        0.3065243714559272,
+                        0.3545466688724501,
+                        0.21844354204798888,
+                        0.053350533882705435,
+                        0.043587100511912974,
+                        0.09799630470643801,
+                        0.10512987970663229,
+                        0.016380302384406024,
+                        0.002169456788049435,
+                        0.10070984351883364,
+                        0.10163106548753786,
+                        0.026399777941356704,
+                        0.0020461802224458975,
+                        0.08102576186444525,
+                        0.09971133875885727,
+                        0.028058008122129583,
+                        0.0017529504319345533,
+                        0.07209880855335549,
+                        0.06331904047058992,
+                        0.04149379483074872,
+                        0.01843194410669704,
+                        0.21029348506622267,
+                        8.890689092689295e-06,
+                        5.758346966517874e-05
+                    ],
+                    "q01": [
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        -0.00011170007204678288,
+                        -0.0350740840703673,
+                        -0.02649646236281921,
+                        -0.0028050727380635027,
+                        -9.426515632575396e-05,
+                        -1.374807900157358,
+                        -0.5354302153081253,
+                        -0.6473194788886907,
+                        -0.49177669041598987,
+                        -1.8780234500237012,
+                        -0.85355856517312,
+                        -1.188833228891968,
+                        -2.3189886683615177,
+                        0.025163513809203716,
+                        -0.11590000241994858,
+                        0.027254616334218617,
+                        0.006036052553036485,
+                        -0.009530988392684247,
+                        0.0035405991289481093,
+                        0.0007229138612139622,
+                        0.01636307029640254,
+                        -0.0004343032527643275,
+                        -0.16726862426587313,
+                        -4.8836113049831653e-05,
+                        0.02567418749589847,
+                        -0.024016833289696688,
+                        -0.15735250287967065,
+                        -0.0008375468527870527,
+                        -0.009714188193280066,
+                        -0.05747116584319238,
+                        -0.14910954702118515,
+                        -0.001929540405106106,
+                        -0.15039529739554428,
+                        -0.37968197775437307,
+                        0.03254669904708863,
+                        -1.2543870126546852,
+                        -1.303943062692862,
+                        -0.6799536316732782,
+                        -0.6303896984746518,
+                        1.2865705168038077,
+                        0.006401516842819814,
+                        0.013067307866088613,
+                        0.01262725118611539,
+                        0.011174036935239103,
+                        0.019966230845197767,
+                        -0.08647492251316174,
+                        0.0006215869344666161,
+                        0.02203495231208817,
+                        0.01725570048370192,
+                        -0.021439176000207998,
+                        -0.0023131370850054977,
+                        0.014250255564199055,
+                        0.01823869361150577,
+                        -0.003908939982068881,
+                        -0.0027584949224312687,
+                        -0.10074178677307907,
+                        -0.004372001140849619,
+                        -0.1818862571741591,
+                        -0.09023238522192853,
+                        0.04855890365244546,
+                        -2.4414687376485298e-06,
+                        -0.5999960111901739
+                    ],
+                    "q99": [
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.0,
+                        0.029449282894238496,
+                        0.00016742971014083042,
+                        0.0,
+                        0.00031566800135744813,
+                        0.0057871130284906916,
+                        0.12873437985015881,
+                        0.4299298342542301,
+                        -0.03254669904708863,
+                        0.5437001126633425,
+                        -0.22140740429409098,
+                        0.6898871616651259,
+                        0.4431390503500722,
+                        -0.5147745608161652,
+                        1.5909486264827468,
+                        0.23712931532251702,
+                        0.8461429033868172,
+                        1.297922698244089,
+                        1.1621481931028022,
+                        0.15062389224667413,
+                        0.36973000228336933,
+                        1.4482291357628363,
+                        1.3591272694418859,
+                        -0.004566694395975754,
+                        1.1618997356401126,
+                        1.4852974791242242,
+                        1.5427211015035178,
+                        -0.0019208977371253182,
+                        0.7870620850309203,
+                        1.5673701527367643,
+                        1.4620851178571561,
+                        0.039924607576713726,
+                        0.4326375294264343,
+                        1.5772000551223755,
+                        0.46008667598219477,
+                        0.57853004006268,
+                        0.19030685767770902,
+                        -0.18930773311367147,
+                        0.6756148051964173,
+                        0.6256650970198077,
+                        2.5507818254221712,
+                        0.9547109888525797,
+                        0.3131930388847934,
+                        0.2795055230941873,
+                        0.45624038111699255,
+                        0.5930317936822368,
+                        0.0029542181982621656,
+                        0.012190042270950462,
+                        0.581510587321943,
+                        0.5697987599313858,
+                        0.07413654923497481,
+                        0.008606564642661909,
+                        0.43301102078462383,
+                        0.523493119847704,
+                        0.09345735818405843,
+                        0.00552060761151478,
+                        0.21234308337554603,
+                        0.27454811268900675,
+                        -0.0007255198397156713,
+                        0.0025463204460750163,
+                        1.1936579220245007,
+                        7.982910483274839e-07,
+                        -0.5999537469885612
+                    ]
+                }
+            }
+        },
+        "modalities": {
+            "video": {
+                "zed_left": {
+                    "resolution": [
+                        256,
+                        256
+                    ],
+                    "channels": 3,
+                    "fps": 20.0
+                },
+                "zed_right": {
+                    "resolution": [
+                        256,
+                        256
+                    ],
+                    "channels": 3,
+                    "fps": 20.0
+                }
+            },
+            "state": {
+                "joint_position": {
+                    "absolute": true,
+                    "rotation_type": null,
+                    "shape": [
+                        66
+                    ],
+                    "continuous": true
+                }
+            },
+            "action": {
+                "joint_position": {
+                    "absolute": true,
+                    "rotation_type": null,
+                    "shape": [
+                        66
+                    ],
+                    "continuous": true
+                }
+            }
+        },
+        "embodiment_tag": "new_embodiment"
+    }
+}

checkpoint-10000/experiment_cfg/train_config.yaml ADDED Viewed

	@@ -0,0 +1,38 @@

+train:
+  datasets:
+    - path: "/rlwrld1/home/hazel/ms_demo_v3/260323/ms_demo_ver3_0"
+      embodiment_tag: "new_embodiment"
+      data_config: rby1_wuji
+      weight: 1.0
+    - path: "/rlwrld1/home/hazel/ms_demo_v3/260323/ms_demo_ver3_1"
+      embodiment_tag: "new_embodiment"
+      data_config: rby1_wuji
+      weight: 1.0
+    - path: "/rlwrld1/home/hazel/ms_demo_v3/260323/ms_demo_ver3_2"
+      embodiment_tag: "new_embodiment"
+      data_config: rby1_wuji
+      weight: 1.0
+    - path: "/rlwrld1/home/hazel/ms_demo_v3/260323/ms_demo_ver3_3"
+      embodiment_tag: "new_embodiment"
+      data_config: rby1_wuji
+      weight: 1.0
+    - path: "/rlwrld1/home/hazel/ms_demo_v3/260323/ms_demo_ver3_4"
+      embodiment_tag: "new_embodiment"
+      data_config: rby1_wuji
+      weight: 1.0
+    - path: "/rlwrld1/home/hazel/ms_demo_v3/260323/ms_demo_ver3_5"
+      embodiment_tag: "new_embodiment"
+      data_config: rby1_wuji
+      weight: 1.0
+    - path: "/rlwrld1/home/hazel/ms_demo_v3/260323/ms_demo_ver3_6"
+      embodiment_tag: "new_embodiment"
+      data_config: rby1_wuji
+      weight: 1.0
+    - path: "/rlwrld1/home/hazel/ms_demo_v3/260323/ms_demo_ver3_7_0"
+      embodiment_tag: "new_embodiment"
+      data_config: rby1_wuji
+      weight: 1.0
+    - path: "/rlwrld1/home/hazel/ms_demo_v3/260323/ms_demo_ver3_8"
+      embodiment_tag: "new_embodiment"
+      data_config: rby1_wuji
+      weight: 1.0

checkpoint-10000/model-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:14908e04dd5bf0d5746e27c0f81c4cfafa7a89668f998e4bd2b0245c1bde5669
+size 4999367032

checkpoint-10000/model-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bebfebd16daa582597b63d9a789384a360f79a8ac38b68aeb9e41b7f1edc2abc
+size 2598112928

checkpoint-10000/model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-10000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:75ee7106b5ed3e22dc9def8f9f833c64f93e2d886bec64a1dda0471299903608
+size 10294076454

checkpoint-10000/rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:66f51a2fb291871d931fca0284bd275daa49470f11ca93e3e2a180421934eafa
+size 14512

checkpoint-10000/rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8c27c78c066185cc19cb571917f2bd8114e8b6f2a6173ff0a861184a975c508d
+size 14512

checkpoint-10000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:69cc8a263533866a30bbd70c695b627ba4dea5733aa097b95dcfa6e7f21407ca
+size 1064

checkpoint-10000/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

runs/Mar24_21-00-46_worker-node1000/events.out.tfevents.1774353663.worker-node1000.426899.0 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:77c5b3d34a3c01bd2c12a993adc91657d234d70204c4a3314c32263cf03db3cc
-size 158231

 version https://git-lfs.github.com/spec/v1
+oid sha256:db725c818899f375fd59945196f2c3272772534b755e745620fc0224ccf50184
+size 229549