diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..9546427ecfa969c490c4b431d743434f82ff0a0a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +trainer_state.json filter=lfs diff=lfs merge=lfs -text diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6121c7cda47d8480788a4463968d41d005c830cf --- /dev/null +++ b/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "/mnt/amlfs-01/home/seonghyeony/checkpoints/groot_s_idm_24P_300", + "action_dim": 32, + "action_head_cfg": { + "_convert_": "object", + "_target_": "gr00t.model.action_head.flow_matching_action_head_idm.FlowMatchingActionHeadIDM", + "config": { + "_recursive_": false, + "_target_": "gr00t.model.action_head.flow_matching_action_head_idm.FlowMatchingActionHeadIDMConfig", + "action_dim": 32, + "action_horizon": 16, + "add_pos_embed": true, + "add_seperator_token": true, + "add_view_embed": true, + "backbone_features_projector_cfg": null, + "diffusion_model_cfg": { + "_target_": "gr00t.model.action_head.cross_attention_dit.DiT", + "attention_head_dim": 64, + "dropout": 0.2, + "final_dropout": true, + "interleave_self_attention": true, + "norm_type": "ada_norm", + "num_attention_heads": 16, + "num_layers": 8, + "output_dim": 1024, + "positional_embeddings": null + }, + "hidden_size": 1024, + "max_action_dim": 32, + "max_num_views": 6, + "max_state_dim": 44, + "mm_projector_cfg": { + "_convert_": "object", + "_target_": "gr00t.model.action_head.multimodal_projector.MultimodalProjector", + "config": { + "_target_": "gr00t.model.action_head.multimodal_projector.MultimodalProjectorConfig", + "hidden_size": 1024, + "mm_hidden_size": 1024, + "mm_projector_type": "mlp_doubledownsample" + } + }, + "mm_vision_select_layer": -2, + "model_dtype": "float32", + "noise_beta_alpha": 1.5, + "noise_beta_beta": 1.0, + "noise_s": 0.999, + "num_inference_timesteps": 16, + "num_timestep_buckets": 1000, + "siglip_hidden_size": 1024, + "siglip_model_cfg": { + "_convert_": "object", + "_target_": "gr00t.model.action_head.siglip.SiglipModel.from_pretrained", + "pretrained_model_name_or_path": "google/siglip2-large-patch16-256" + }, + "tune_vision_tower": true, + "vl_self_attention_cfg": { + "_target_": "gr00t.model.action_head.cross_attention_dit.SelfAttentionTransformer", + "attention_head_dim": 64, + "dropout": 0.2, + "final_dropout": true, + "num_attention_heads": 16, + "num_layers": 4, + "positional_embeddings": null + } + } + }, + "action_horizon": 16, + "architectures": [ + "DualBrain" + ], + "backbone_cfg": { + "_target_": "gr00t.model.backbone.IdentityBackbone" + }, + "hidden_size": 0, + "model_dtype": "float32", + "model_type": "dual_brain", + "resume_path": "/mnt/amlfs-01/home/seonghyeony/checkpoints/groot_s_idm_24P_300", + "torch_dtype": "bfloat16", + "transformers_version": "4.45.2" +} diff --git a/experiment_cfg/conf.yaml b/experiment_cfg/conf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..238614bd8a22bb1d99338389fbbc124be92ab7fe --- /dev/null +++ b/experiment_cfg/conf.yaml @@ -0,0 +1,6962 @@ +model: + _target_: gr00t.model.idm.IDM + _convert_: object + config: + _target_: gr00t.model.idm.IDMConfig + _recursive_: false + model_dtype: float32 + hidden_size: 0 + action_horizon: 16 + action_dim: 32 + backbone_cfg: + _target_: gr00t.model.backbone.IdentityBackbone + action_head_cfg: + _target_: gr00t.model.action_head.flow_matching_action_head_idm.FlowMatchingActionHeadIDM + _convert_: object + config: + _target_: gr00t.model.action_head.flow_matching_action_head_idm.FlowMatchingActionHeadIDMConfig + _recursive_: false + add_seperator_token: true + add_pos_embed: true + model_dtype: float32 + mm_vision_select_layer: -2 + max_state_dim: 44 + max_action_dim: 32 + hidden_size: 1024 + tune_vision_tower: true + add_view_embed: true + max_num_views: 6 + siglip_model_cfg: + _target_: gr00t.model.action_head.siglip.SiglipModel.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + siglip_hidden_size: 1024 + vl_self_attention_cfg: + _target_: gr00t.model.action_head.cross_attention_dit.SelfAttentionTransformer + positional_embeddings: null + num_layers: 4 + num_attention_heads: 16 + attention_head_dim: 64 + dropout: 0.2 + final_dropout: true + diffusion_model_cfg: + _target_: gr00t.model.action_head.cross_attention_dit.DiT + positional_embeddings: null + num_layers: 8 + num_attention_heads: 16 + attention_head_dim: 64 + norm_type: ada_norm + dropout: 0.2 + final_dropout: true + output_dim: 1024 + interleave_self_attention: true + mm_projector_cfg: + _target_: gr00t.model.action_head.multimodal_projector.MultimodalProjector + _convert_: object + config: + _target_: gr00t.model.action_head.multimodal_projector.MultimodalProjectorConfig + hidden_size: 1024 + mm_hidden_size: 1024 + mm_projector_type: mlp_doubledownsample + action_dim: 32 + action_horizon: 16 + num_inference_timesteps: 16 + noise_beta_alpha: 1.5 + noise_beta_beta: 1.0 + noise_s: 0.999 + num_timestep_buckets: 1000 + backbone_features_projector_cfg: null +train_dataset: + _target_: gr00t.data.dataset.lerobot_sharded.ShardedLeRobotMixtureDataset.from_mixture_spec + _convert_: object + mixture_spec: + - dataset_path: + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.CloseDoubleDoor256_300 + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.CloseDrawer256_300 + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.CloseSingleDoor256_300 + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.CoffeePressButton256_300 + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.CoffeeServeMug256_300 + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.CoffeeSetupMug256_300 + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.OpenDoubleDoor256_300 + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.OpenDrawer256_300 + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.OpenSingleDoor256_300 + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPCabToCounter256_300 + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPCounterToCab256_300 + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPCounterToMicrowave256_300 + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPCounterToSink256_300 + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPCounterToStove256_300 + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPMicrowaveToCounter256_300 + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPSinkToCounter256_300 + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.PnPStoveToCounter256_300 + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.TurnOffMicrowave256_300 + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.TurnOffSinkFaucet256_300 + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.TurnOffStove256_300 + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.TurnOnMicrowave256_300 + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.TurnOnSinkFaucet256_300 + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.TurnOnStove256_300 + - /mnt/amlfs-02/shared/datasets/posttrain/24P/robocasa_panda_omron.TurnSinkSpout256_300 + dataset_weight: 1.0 + dataset_class: gr00t.data.dataset.lerobot_sharded.ShardedLeRobotSingleDataset + all_modality_configs: + robocasa_gr1_arms_only_fourier_hands: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.ego_view_pad_res256_freq20 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.human.action.task_description + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action + robocasa_gr1_arms_waist_fourier_hands: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.ego_view_pad_res256_freq20 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.human.action.task_description + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action + robocasa_gr1_fixed_lower_body_fourier_hands: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.agentview_pad_res256_freq20 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + - state.neck + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - action.neck + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.human.action.task_description + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action + robocasa_bimanual_panda_parallel_gripper: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_gripper_qpos + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_gripper_qpos + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_gripper_close + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_gripper_close + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.human.action.task_description + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action + robocasa_bimanual_panda_inspire_hand: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_hand + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_hand + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_hand + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_hand + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.human.action.task_description + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action + robocasa_panda_omron: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.left_view + - video.right_view + - video.wrist_view + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.end_effector_position_relative + - state.end_effector_rotation_relative + - state.gripper_qpos + - state.base_position + - state.base_rotation + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.end_effector_position + - action.end_effector_rotation + - action.gripper_close + - action.base_motion + - action.control_mode + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.human.action.task_description + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action + gr1_unified: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.ego_view_pad_res256_freq20 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.human.coarse_action + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action + oxe_droid: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.exterior_image_1_left_pad_res256_freq15 + - video.exterior_image_2_left_pad_res256_freq15 + - video.wrist_image_left_pad_res256_freq15 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.eef_position + - state.eef_rotation + - state.gripper_position + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.eef_position_delta + - action.eef_rotation_delta + - action.gripper_position + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.language.language_instruction + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action + oxe_fractal: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.image_pad_res256_freq03 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.eef_position + - state.eef_rotation + - state.gripper_closedness_commanded + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.world_vector + - action.rotation_delta + - action.gripper_position + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.language.natural_language_instruction + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action + oxe_language_table: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.rgb_pad_res256_freq10 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.effector_translation + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.action + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.language.instruction + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action + oxe_bridge: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.image_0 + - video.image_1 + - video.image_2 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.eef_position + - state.eef_rotation + - state.gripper_closed + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.eef_position + - action.eef_rotation + - action.gripper_position + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.language.language_instruction + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action + agibot: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.top_head + - video.hand_left + - video.hand_right + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.left_arm_joint_position + - state.right_arm_joint_position + - state.left_effector_position + - state.right_effector_position + - state.head_position + - state.waist_position + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.left_arm_joint_position + - action.right_arm_joint_position + - action.left_effector_position + - action.right_effector_position + - action.head_position + - action.waist_position + - action.robot_velocity + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.agibot.task_description + all_transforms: + robocasa_gr1_arms_only_fourier_hands: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.ego_view_pad_res256_freq20 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.ego_view_pad_res256_freq20 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.ego_view_pad_res256_freq20 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.ego_view_pad_res256_freq20 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.ego_view_pad_res256_freq20 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + normalization_modes: + state.left_arm: min_max + state.right_arm: min_max + state.left_hand: min_max + state.right_hand: min_max + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + normalization_modes: + action.right_arm: min_max + action.left_arm: min_max + action.right_hand: min_max + action.left_hand: min_max + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.ego_view_pad_res256_freq20 + state_concat_order: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + action_concat_order: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 + robocasa_gr1_arms_waist_fourier_hands: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.ego_view_pad_res256_freq20 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.ego_view_pad_res256_freq20 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.ego_view_pad_res256_freq20 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.ego_view_pad_res256_freq20 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.ego_view_pad_res256_freq20 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + normalization_modes: + state.left_arm: min_max + state.right_arm: min_max + state.left_hand: min_max + state.right_hand: min_max + state.waist: min_max + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + normalization_modes: + action.right_arm: min_max + action.left_arm: min_max + action.right_hand: min_max + action.left_hand: min_max + action.waist: min_max + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.ego_view_pad_res256_freq20 + state_concat_order: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + action_concat_order: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 + robocasa_gr1_fixed_lower_body_fourier_hands: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.agentview_pad_res256_freq20 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.agentview_pad_res256_freq20 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.agentview_pad_res256_freq20 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.agentview_pad_res256_freq20 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.agentview_pad_res256_freq20 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + - state.neck + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + - state.neck + normalization_modes: + state.left_arm: min_max + state.right_arm: min_max + state.left_hand: min_max + state.right_hand: min_max + state.waist: min_max + state.neck: min_max + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - action.neck + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - action.neck + normalization_modes: + action.right_arm: min_max + action.left_arm: min_max + action.right_hand: min_max + action.left_hand: min_max + action.waist: min_max + action.neck: min_max + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.agentview_pad_res256_freq20 + state_concat_order: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + - state.neck + action_concat_order: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - action.neck + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 + robocasa_bimanual_panda_parallel_gripper: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_gripper_qpos + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_gripper_qpos + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_gripper_qpos + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_gripper_qpos + normalization_modes: + state.right_arm_eef_pos: min_max + state.right_gripper_qpos: min_max + state.left_arm_eef_pos: min_max + state.left_gripper_qpos: min_max + target_rotations: + state.right_arm_eef_quat: rotation_6d + state.left_arm_eef_quat: rotation_6d + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_gripper_close + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_gripper_close + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_gripper_close + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_gripper_close + normalization_modes: + action.right_gripper_close: binary + action.left_gripper_close: binary + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + state_concat_order: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_gripper_qpos + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_gripper_qpos + action_concat_order: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_gripper_close + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_gripper_close + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 + robocasa_bimanual_panda_inspire_hand: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_hand + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_hand + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_hand + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_hand + normalization_modes: + state.right_arm_eef_pos: min_max + state.right_hand: min_max + state.left_arm_eef_pos: min_max + state.left_hand: min_max + target_rotations: + state.right_arm_eef_quat: rotation_6d + state.left_arm_eef_quat: rotation_6d + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_hand + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_hand + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_hand + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_hand + normalization_modes: + action.right_hand: min_max + action.left_hand: min_max + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + state_concat_order: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_hand + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_hand + action_concat_order: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_hand + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_hand + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 + robocasa_panda_omron: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.left_view + - video.right_view + - video.wrist_view + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.left_view + - video.right_view + - video.wrist_view + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.left_view + - video.right_view + - video.wrist_view + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.left_view + - video.right_view + - video.wrist_view + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.left_view + - video.right_view + - video.wrist_view + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.end_effector_position_relative + - state.end_effector_rotation_relative + - state.gripper_qpos + - state.base_position + - state.base_rotation + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.end_effector_position_relative + - state.end_effector_rotation_relative + - state.gripper_qpos + - state.base_position + - state.base_rotation + normalization_modes: + state.end_effector_position_relative: min_max + state.end_effector_rotation_relative: min_max + state.gripper_qpos: min_max + state.base_position: min_max + state.base_rotation: min_max + target_rotations: + state.end_effector_rotation_relative: rotation_6d + state.base_rotation: rotation_6d + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.end_effector_position + - action.end_effector_rotation + - action.gripper_close + - action.base_motion + - action.control_mode + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.end_effector_position + - action.end_effector_rotation + - action.gripper_close + - action.base_motion + - action.control_mode + normalization_modes: + action.end_effector_position: min_max + action.end_effector_rotation: min_max + action.gripper_close: binary + action.base_motion: min_max + action.control_mode: binary + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.left_view + - video.right_view + - video.wrist_view + state_concat_order: + - state.end_effector_position_relative + - state.end_effector_rotation_relative + - state.gripper_qpos + - state.base_position + - state.base_rotation + action_concat_order: + - action.end_effector_position + - action.end_effector_rotation + - action.gripper_close + - action.base_motion + - action.control_mode + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 + gr1_unified: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.ego_view_pad_res256_freq20 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.ego_view_pad_res256_freq20 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.ego_view_pad_res256_freq20 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.ego_view_pad_res256_freq20 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.ego_view_pad_res256_freq20 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + normalization_modes: + state.left_arm: scale + state.right_arm: scale + state.left_hand: scale + state.right_hand: scale + state.waist: scale + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + normalization_modes: + action.left_arm: scale + action.right_arm: scale + action.left_hand: scale + action.right_hand: scale + action.waist: scale + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.ego_view_pad_res256_freq20 + state_concat_order: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + action_concat_order: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 + oxe_droid: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.exterior_image_1_left_pad_res256_freq15 + - video.exterior_image_2_left_pad_res256_freq15 + - video.wrist_image_left_pad_res256_freq15 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.exterior_image_1_left_pad_res256_freq15 + - video.exterior_image_2_left_pad_res256_freq15 + - video.wrist_image_left_pad_res256_freq15 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.exterior_image_1_left_pad_res256_freq15 + - video.exterior_image_2_left_pad_res256_freq15 + - video.wrist_image_left_pad_res256_freq15 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.exterior_image_1_left_pad_res256_freq15 + - video.exterior_image_2_left_pad_res256_freq15 + - video.wrist_image_left_pad_res256_freq15 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.exterior_image_1_left_pad_res256_freq15 + - video.exterior_image_2_left_pad_res256_freq15 + - video.wrist_image_left_pad_res256_freq15 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.eef_position + - state.eef_rotation + - state.gripper_position + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.eef_position + - state.eef_rotation + - state.gripper_position + normalization_modes: + state.eef_position: min_max + state.gripper_position: min_max + target_rotations: + state.eef_rotation: rotation_6d + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.eef_position_delta + - action.eef_rotation_delta + - action.gripper_position + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.eef_position_delta + - action.eef_rotation_delta + - action.gripper_position + normalization_modes: + action.gripper_position: binary + target_rotations: + action.eef_rotation_delta: axis_angle + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.exterior_image_1_left_pad_res256_freq15 + - video.exterior_image_2_left_pad_res256_freq15 + - video.wrist_image_left_pad_res256_freq15 + state_concat_order: + - state.eef_position + - state.eef_rotation + - state.gripper_position + action_concat_order: + - action.eef_position_delta + - action.eef_rotation_delta + - action.gripper_position + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 + oxe_fractal: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.image_pad_res256_freq03 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.image_pad_res256_freq03 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.image_pad_res256_freq03 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.image_pad_res256_freq03 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.image_pad_res256_freq03 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.eef_position + - state.eef_rotation + - state.gripper_closedness_commanded + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.eef_position + - state.eef_rotation + - state.gripper_closedness_commanded + normalization_modes: + state.eef_position: min_max + state.gripper_closedness_commanded: min_max + target_rotations: + state.eef_rotation: rotation_6d + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.world_vector + - action.rotation_delta + - action.gripper_position + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.world_vector + - action.rotation_delta + - action.gripper_position + normalization_modes: + action.gripper_position: binary + target_rotations: + action.rotation_delta: axis_angle + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.image_pad_res256_freq03 + state_concat_order: + - state.eef_position + - state.eef_rotation + - state.gripper_closedness_commanded + action_concat_order: + - action.world_vector + - action.rotation_delta + - action.gripper_position + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 + oxe_language_table: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.rgb_pad_res256_freq10 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.rgb_pad_res256_freq10 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.rgb_pad_res256_freq10 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.rgb_pad_res256_freq10 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.rgb_pad_res256_freq10 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.effector_translation + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.effector_translation + normalization_modes: + state.effector_translation: min_max + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.action + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.action + normalization_modes: + action.action: min_max + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.rgb_pad_res256_freq10 + state_concat_order: + - state.effector_translation + action_concat_order: + - action.action + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 + oxe_bridge: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.image_0 + - video.image_1 + - video.image_2 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.image_0 + - video.image_1 + - video.image_2 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.image_0 + - video.image_1 + - video.image_2 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.image_0 + - video.image_1 + - video.image_2 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.image_0 + - video.image_1 + - video.image_2 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.eef_position + - state.eef_rotation + - state.gripper_closed + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.eef_position + - state.eef_rotation + - state.gripper_closed + normalization_modes: + state.eef_position: min_max + state.gripper_closed: min_max + target_rotations: + state.eef_rotation: rotation_6d + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.eef_position + - action.eef_rotation + - action.gripper_position + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.eef_position + - action.eef_rotation + - action.gripper_position + normalization_modes: + action.gripper_position: binary + target_rotations: + action.eef_rotation: axis_angle + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.image_0 + - video.image_1 + - video.image_2 + state_concat_order: + - state.eef_position + - state.eef_rotation + - state.gripper_closed + action_concat_order: + - action.eef_position + - action.eef_rotation + - action.gripper_position + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 + agibot: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.top_head + - video.hand_left + - video.hand_right + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.top_head + - video.hand_left + - video.hand_right + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.top_head + - video.hand_left + - video.hand_right + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.top_head + - video.hand_left + - video.hand_right + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.top_head + - video.hand_left + - video.hand_right + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.left_arm_joint_position + - state.right_arm_joint_position + - state.left_effector_position + - state.right_effector_position + - state.head_position + - state.waist_position + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.left_arm_joint_position + - state.right_arm_joint_position + - state.left_effector_position + - state.right_effector_position + - state.head_position + - state.waist_position + normalization_modes: + state.left_arm_joint_position: min_max + state.right_arm_joint_position: min_max + state.left_effector_position: min_max + state.right_effector_position: min_max + state.head_position: min_max + state.waist_position: min_max + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.left_arm_joint_position + - action.right_arm_joint_position + - action.left_effector_position + - action.right_effector_position + - action.head_position + - action.waist_position + - action.robot_velocity + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.left_arm_joint_position + - action.right_arm_joint_position + - action.left_effector_position + - action.right_effector_position + - action.head_position + - action.waist_position + - action.robot_velocity + normalization_modes: + action.left_arm_joint_position: min_max + action.right_arm_joint_position: min_max + action.left_effector_position: min_max + action.right_effector_position: min_max + action.head_position: min_max + action.waist_position: min_max + action.robot_velocity: min_max + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.top_head + - video.hand_left + - video.hand_right + state_concat_order: + - state.left_arm_joint_position + - state.right_arm_joint_position + - state.left_effector_position + - state.right_effector_position + - state.head_position + - state.waist_position + action_concat_order: + - action.left_arm_joint_position + - action.right_arm_joint_position + - action.left_effector_position + - action.right_effector_position + - action.head_position + - action.waist_position + - action.robot_velocity + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 + metadata_versions: + robocasa_gr1_arms_only_fourier_hands: '0217' + robocasa_gr1_fixed_lower_body_fourier_hands: '0217' + robocasa_bimanual_panda_parallel_gripper: '0217' + robocasa_bimanual_panda_inspire_hand: '0217' + robocasa_panda_omron: '0217' + gr1_unified: '0225' + oxe_droid: '0221' + oxe_fractal: '0221' + oxe_language_table: '0221' + oxe_bridge: '0221' + robocasa_gr1_arms_waist_fourier_hands: '0225' + agibot: '0225' + dataset_kwargs: + video_backend: decord + mixture_kwargs: + training: true + balance_dataset_weights: true + seed: 42 +trainer: + _target_: gr00t.experiment.dual_brain.experiment.DualBrainTrainer + _partial_: true + _recursive_: false + callbacks: null + model: ??? + train_dataset: ??? + compute_dtype: ??? + benchmark_time: false + enable_profiling: false + profiling_steps: 5 +wandb_project: dream_idm +output_dir: /mnt/amlfs-01/home/seonghyeony/checkpoints/gr00t_s_idm_24P_300 +load_from_yaml: null +gear_credentials: /mnt/amlfs-01/home/seonghyeony/.gear/data_credentials +upload_checkpoints: false +upload_every: 10000 +upload_last_n_checkpoints: 5 +remove_unused_columns: false +bf16: true +tf32: true +global_batch_size: 1024 +raise_error_if_global_batch_size_not_set: true +per_device_train_batch_size: 32 +per_device_eval_batch_size: 64 +gradient_accumulation_steps: 1 +dataloader_num_workers: 6 +dataloader_pin_memory: false +dataloader_persistent_workers: true +optim: adamw_torch +learning_rate: 0.0001 +adam_beta1: 0.95 +adam_beta2: 0.999 +adam_epsilon: 1.0e-08 +weight_decay: 1.0e-05 +lr_scheduler_type: cosine +warmup_ratio: 0.05 +logging_steps: 10.0 +num_train_epochs: 1000 +max_steps: 60000 +save_strategy: steps +save_steps: 1000 +eval_strategy: 'no' +save_total_limit: 20 +report_to: wandb +seed: 42 +do_eval: false +gradient_checkpointing: false +ddp_find_unused_parameters: false +ddp_bucket_cap_mb: 100 +ray_num_workers: 32 +eval_bf16: true +torch_compile_mode: null +pretrained_model_path: null +only_tune_projectors: false +training_args: + _target_: transformers.TrainingArguments + output_dir: /mnt/amlfs-01/home/seonghyeony/checkpoints/gr00t_s_idm_24P_300 + run_name: gr00t_s_idm_24P_300 + remove_unused_columns: false + deepspeed: gr00t/gr00t/experiment/dual_brain/configs/deepspeed/zero2.json + gradient_checkpointing: false + bf16: true + tf32: true + per_device_train_batch_size: 32 + per_device_eval_batch_size: 64 + gradient_accumulation_steps: 1 + dataloader_num_workers: 6 + dataloader_pin_memory: false + dataloader_persistent_workers: true + optim: adamw_torch + adam_beta1: 0.95 + adam_beta2: 0.999 + adam_epsilon: 1.0e-08 + learning_rate: 0.0001 + weight_decay: 1.0e-05 + warmup_ratio: 0.05 + lr_scheduler_type: cosine + logging_steps: 10.0 + num_train_epochs: 1000 + max_steps: 60000 + save_strategy: steps + save_steps: 1000 + save_total_limit: 20 + report_to: wandb + seed: 42 + do_eval: false + ddp_find_unused_parameters: false + ddp_bucket_cap_mb: 100 + torch_compile_mode: null +add_seperator_token: true +add_pos_embed: true +hidden_size: 1024 +attn_dropout: 0.2 +siglip_hidden_size: 1024 +siglip_version: google/siglip2-large-patch16-256 +action_head_cfg: + _target_: gr00t.model.action_head.flow_matching_action_head_idm.FlowMatchingActionHeadIDM + _convert_: object + config: + _target_: gr00t.model.action_head.flow_matching_action_head_idm.FlowMatchingActionHeadIDMConfig + _recursive_: false + add_seperator_token: true + add_pos_embed: true + model_dtype: float32 + mm_vision_select_layer: -2 + max_state_dim: 44 + max_action_dim: 32 + hidden_size: 1024 + tune_vision_tower: true + add_view_embed: true + max_num_views: 6 + siglip_model_cfg: + _target_: gr00t.model.action_head.siglip.SiglipModel.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + siglip_hidden_size: 1024 + vl_self_attention_cfg: + _target_: gr00t.model.action_head.cross_attention_dit.SelfAttentionTransformer + positional_embeddings: null + num_layers: 4 + num_attention_heads: 16 + attention_head_dim: 64 + dropout: 0.2 + final_dropout: true + diffusion_model_cfg: + _target_: gr00t.model.action_head.cross_attention_dit.DiT + positional_embeddings: null + num_layers: 8 + num_attention_heads: 16 + attention_head_dim: 64 + norm_type: ada_norm + dropout: 0.2 + final_dropout: true + output_dim: 1024 + interleave_self_attention: true + mm_projector_cfg: + _target_: gr00t.model.action_head.multimodal_projector.MultimodalProjector + _convert_: object + config: + _target_: gr00t.model.action_head.multimodal_projector.MultimodalProjectorConfig + hidden_size: 1024 + mm_hidden_size: 1024 + mm_projector_type: mlp_doubledownsample + action_dim: 32 + action_horizon: 16 + num_inference_timesteps: 16 + noise_beta_alpha: 1.5 + noise_beta_beta: 1.0 + noise_s: 0.999 + num_timestep_buckets: 1000 + backbone_features_projector_cfg: null +backbone_hidden_size: 0 +backbone_cfg: + _target_: gr00t.model.backbone.IdentityBackbone +embodiment_tag_to_projector_index: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 +num_visual_tokens_per_frame: 16 +max_action_dim: 32 +language_dropout_prob: 0.0 +model_image_resolution: 224 +max_sequence_length: 112 +model_specific_transform: + _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 +data_collator: + _target_: gr00t.model.transforms_idm.DefaultDataCollatorGR00TIDM +action_horizon: 16 +totensor_cfg: + _target_: gr00t.data.transform.VideoToTensor + apply_to: ??? +crop_cfg: + _target_: gr00t.data.transform.VideoCrop + apply_to: ??? + scale: 0.95 + mode: random +resize_cfg: + _target_: gr00t.data.transform.VideoResize + apply_to: ??? + height: 224 + width: 224 + interpolation: linear +color_jitter_cfg: + _target_: gr00t.data.transform.VideoColorJitter + apply_to: ??? + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 +to_numpy_cfg: + _target_: gr00t.data.transform.VideoToNumpy + apply_to: ??? +modality_config_robocasa_gr1_arms_only_fourier_hands: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.ego_view_pad_res256_freq20 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.human.action.task_description + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action +transform_robocasa_gr1_arms_only_fourier_hands: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.ego_view_pad_res256_freq20 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.ego_view_pad_res256_freq20 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.ego_view_pad_res256_freq20 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.ego_view_pad_res256_freq20 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.ego_view_pad_res256_freq20 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + normalization_modes: + state.left_arm: min_max + state.right_arm: min_max + state.left_hand: min_max + state.right_hand: min_max + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + normalization_modes: + action.right_arm: min_max + action.left_arm: min_max + action.right_hand: min_max + action.left_hand: min_max + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.ego_view_pad_res256_freq20 + state_concat_order: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + action_concat_order: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 +modality_config_robocasa_gr1_arms_waist_fourier_hands: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.ego_view_pad_res256_freq20 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.human.action.task_description + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action +transform_robocasa_gr1_arms_waist_fourier_hands: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.ego_view_pad_res256_freq20 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.ego_view_pad_res256_freq20 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.ego_view_pad_res256_freq20 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.ego_view_pad_res256_freq20 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.ego_view_pad_res256_freq20 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + normalization_modes: + state.left_arm: min_max + state.right_arm: min_max + state.left_hand: min_max + state.right_hand: min_max + state.waist: min_max + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + normalization_modes: + action.right_arm: min_max + action.left_arm: min_max + action.right_hand: min_max + action.left_hand: min_max + action.waist: min_max + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.ego_view_pad_res256_freq20 + state_concat_order: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + action_concat_order: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 +modality_config_robocasa_panda_omron: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.left_view + - video.right_view + - video.wrist_view + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.end_effector_position_relative + - state.end_effector_rotation_relative + - state.gripper_qpos + - state.base_position + - state.base_rotation + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.end_effector_position + - action.end_effector_rotation + - action.gripper_close + - action.base_motion + - action.control_mode + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.human.action.task_description + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action +transform_robocasa_panda_omron: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.left_view + - video.right_view + - video.wrist_view + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.left_view + - video.right_view + - video.wrist_view + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.left_view + - video.right_view + - video.wrist_view + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.left_view + - video.right_view + - video.wrist_view + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.left_view + - video.right_view + - video.wrist_view + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.end_effector_position_relative + - state.end_effector_rotation_relative + - state.gripper_qpos + - state.base_position + - state.base_rotation + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.end_effector_position_relative + - state.end_effector_rotation_relative + - state.gripper_qpos + - state.base_position + - state.base_rotation + normalization_modes: + state.end_effector_position_relative: min_max + state.end_effector_rotation_relative: min_max + state.gripper_qpos: min_max + state.base_position: min_max + state.base_rotation: min_max + target_rotations: + state.end_effector_rotation_relative: rotation_6d + state.base_rotation: rotation_6d + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.end_effector_position + - action.end_effector_rotation + - action.gripper_close + - action.base_motion + - action.control_mode + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.end_effector_position + - action.end_effector_rotation + - action.gripper_close + - action.base_motion + - action.control_mode + normalization_modes: + action.end_effector_position: min_max + action.end_effector_rotation: min_max + action.gripper_close: binary + action.base_motion: min_max + action.control_mode: binary + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.left_view + - video.right_view + - video.wrist_view + state_concat_order: + - state.end_effector_position_relative + - state.end_effector_rotation_relative + - state.gripper_qpos + - state.base_position + - state.base_rotation + action_concat_order: + - action.end_effector_position + - action.end_effector_rotation + - action.gripper_close + - action.base_motion + - action.control_mode + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 +modality_config_robocasa_gr1_fixed_lower_body_fourier_hands: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.agentview_pad_res256_freq20 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + - state.neck + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - action.neck + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.human.action.task_description + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action +transform_robocasa_gr1_fixed_lower_body_fourier_hands: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.agentview_pad_res256_freq20 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.agentview_pad_res256_freq20 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.agentview_pad_res256_freq20 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.agentview_pad_res256_freq20 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.agentview_pad_res256_freq20 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + - state.neck + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + - state.neck + normalization_modes: + state.left_arm: min_max + state.right_arm: min_max + state.left_hand: min_max + state.right_hand: min_max + state.waist: min_max + state.neck: min_max + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - action.neck + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - action.neck + normalization_modes: + action.right_arm: min_max + action.left_arm: min_max + action.right_hand: min_max + action.left_hand: min_max + action.waist: min_max + action.neck: min_max + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.agentview_pad_res256_freq20 + state_concat_order: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + - state.neck + action_concat_order: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - action.neck + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 +modality_config_robocasa_bimanual_panda_parallel_gripper: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_gripper_qpos + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_gripper_qpos + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_gripper_close + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_gripper_close + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.human.action.task_description + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action +transform_robocasa_bimanual_panda_parallel_gripper: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_gripper_qpos + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_gripper_qpos + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_gripper_qpos + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_gripper_qpos + normalization_modes: + state.right_arm_eef_pos: min_max + state.right_gripper_qpos: min_max + state.left_arm_eef_pos: min_max + state.left_gripper_qpos: min_max + target_rotations: + state.right_arm_eef_quat: rotation_6d + state.left_arm_eef_quat: rotation_6d + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_gripper_close + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_gripper_close + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_gripper_close + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_gripper_close + normalization_modes: + action.right_gripper_close: binary + action.left_gripper_close: binary + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + state_concat_order: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_gripper_qpos + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_gripper_qpos + action_concat_order: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_gripper_close + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_gripper_close + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 +modality_config_robocasa_bimanual_panda_inspire_hand: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_hand + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_hand + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_hand + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_hand + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.human.action.task_description + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action +transform_robocasa_bimanual_panda_inspire_hand: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_hand + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_hand + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_hand + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_hand + normalization_modes: + state.right_arm_eef_pos: min_max + state.right_hand: min_max + state.left_arm_eef_pos: min_max + state.left_hand: min_max + target_rotations: + state.right_arm_eef_quat: rotation_6d + state.left_arm_eef_quat: rotation_6d + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_hand + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_hand + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_hand + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_hand + normalization_modes: + action.right_hand: min_max + action.left_hand: min_max + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + state_concat_order: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_hand + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_hand + action_concat_order: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_hand + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_hand + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 +modality_config_gr1_unified: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.ego_view_pad_res256_freq20 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.human.coarse_action + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action +transform_gr1_unified: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.ego_view_pad_res256_freq20 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.ego_view_pad_res256_freq20 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.ego_view_pad_res256_freq20 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.ego_view_pad_res256_freq20 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.ego_view_pad_res256_freq20 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + normalization_modes: + state.left_arm: scale + state.right_arm: scale + state.left_hand: scale + state.right_hand: scale + state.waist: scale + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + normalization_modes: + action.left_arm: scale + action.right_arm: scale + action.left_hand: scale + action.right_hand: scale + action.waist: scale + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.ego_view_pad_res256_freq20 + state_concat_order: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + action_concat_order: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 +modality_config_oxe_droid: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.exterior_image_1_left_pad_res256_freq15 + - video.exterior_image_2_left_pad_res256_freq15 + - video.wrist_image_left_pad_res256_freq15 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.eef_position + - state.eef_rotation + - state.gripper_position + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.eef_position_delta + - action.eef_rotation_delta + - action.gripper_position + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.language.language_instruction + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action +transform_oxe_droid: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.exterior_image_1_left_pad_res256_freq15 + - video.exterior_image_2_left_pad_res256_freq15 + - video.wrist_image_left_pad_res256_freq15 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.exterior_image_1_left_pad_res256_freq15 + - video.exterior_image_2_left_pad_res256_freq15 + - video.wrist_image_left_pad_res256_freq15 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.exterior_image_1_left_pad_res256_freq15 + - video.exterior_image_2_left_pad_res256_freq15 + - video.wrist_image_left_pad_res256_freq15 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.exterior_image_1_left_pad_res256_freq15 + - video.exterior_image_2_left_pad_res256_freq15 + - video.wrist_image_left_pad_res256_freq15 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.exterior_image_1_left_pad_res256_freq15 + - video.exterior_image_2_left_pad_res256_freq15 + - video.wrist_image_left_pad_res256_freq15 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.eef_position + - state.eef_rotation + - state.gripper_position + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.eef_position + - state.eef_rotation + - state.gripper_position + normalization_modes: + state.eef_position: min_max + state.gripper_position: min_max + target_rotations: + state.eef_rotation: rotation_6d + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.eef_position_delta + - action.eef_rotation_delta + - action.gripper_position + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.eef_position_delta + - action.eef_rotation_delta + - action.gripper_position + normalization_modes: + action.gripper_position: binary + target_rotations: + action.eef_rotation_delta: axis_angle + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.exterior_image_1_left_pad_res256_freq15 + - video.exterior_image_2_left_pad_res256_freq15 + - video.wrist_image_left_pad_res256_freq15 + state_concat_order: + - state.eef_position + - state.eef_rotation + - state.gripper_position + action_concat_order: + - action.eef_position_delta + - action.eef_rotation_delta + - action.gripper_position + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 +modality_config_oxe_fractal: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.image_pad_res256_freq03 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.eef_position + - state.eef_rotation + - state.gripper_closedness_commanded + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.world_vector + - action.rotation_delta + - action.gripper_position + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.language.natural_language_instruction + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action +transform_oxe_fractal: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.image_pad_res256_freq03 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.image_pad_res256_freq03 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.image_pad_res256_freq03 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.image_pad_res256_freq03 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.image_pad_res256_freq03 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.eef_position + - state.eef_rotation + - state.gripper_closedness_commanded + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.eef_position + - state.eef_rotation + - state.gripper_closedness_commanded + normalization_modes: + state.eef_position: min_max + state.gripper_closedness_commanded: min_max + target_rotations: + state.eef_rotation: rotation_6d + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.world_vector + - action.rotation_delta + - action.gripper_position + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.world_vector + - action.rotation_delta + - action.gripper_position + normalization_modes: + action.gripper_position: binary + target_rotations: + action.rotation_delta: axis_angle + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.image_pad_res256_freq03 + state_concat_order: + - state.eef_position + - state.eef_rotation + - state.gripper_closedness_commanded + action_concat_order: + - action.world_vector + - action.rotation_delta + - action.gripper_position + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 +modality_config_oxe_language_table: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.rgb_pad_res256_freq10 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.effector_translation + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.action + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.language.instruction + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action +transform_oxe_language_table: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.rgb_pad_res256_freq10 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.rgb_pad_res256_freq10 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.rgb_pad_res256_freq10 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.rgb_pad_res256_freq10 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.rgb_pad_res256_freq10 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.effector_translation + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.effector_translation + normalization_modes: + state.effector_translation: min_max + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.action + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.action + normalization_modes: + action.action: min_max + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.rgb_pad_res256_freq10 + state_concat_order: + - state.effector_translation + action_concat_order: + - action.action + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 +modality_config_oxe_bridge: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.image_0 + - video.image_1 + - video.image_2 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.eef_position + - state.eef_rotation + - state.gripper_closed + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.eef_position + - action.eef_rotation + - action.gripper_position + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.language.language_instruction + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action +transform_oxe_bridge: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.image_0 + - video.image_1 + - video.image_2 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.image_0 + - video.image_1 + - video.image_2 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.image_0 + - video.image_1 + - video.image_2 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.image_0 + - video.image_1 + - video.image_2 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.image_0 + - video.image_1 + - video.image_2 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.eef_position + - state.eef_rotation + - state.gripper_closed + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.eef_position + - state.eef_rotation + - state.gripper_closed + normalization_modes: + state.eef_position: min_max + state.gripper_closed: min_max + target_rotations: + state.eef_rotation: rotation_6d + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.eef_position + - action.eef_rotation + - action.gripper_position + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.eef_position + - action.eef_rotation + - action.gripper_position + normalization_modes: + action.gripper_position: binary + target_rotations: + action.eef_rotation: axis_angle + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.image_0 + - video.image_1 + - video.image_2 + state_concat_order: + - state.eef_position + - state.eef_rotation + - state.gripper_closed + action_concat_order: + - action.eef_position + - action.eef_rotation + - action.gripper_position + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 +modality_config_agibot: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.top_head + - video.hand_left + - video.hand_right + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.left_arm_joint_position + - state.right_arm_joint_position + - state.left_effector_position + - state.right_effector_position + - state.head_position + - state.waist_position + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.left_arm_joint_position + - action.right_arm_joint_position + - action.left_effector_position + - action.right_effector_position + - action.head_position + - action.waist_position + - action.robot_velocity + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.agibot.task_description +transform_agibot: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.top_head + - video.hand_left + - video.hand_right + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.top_head + - video.hand_left + - video.hand_right + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.top_head + - video.hand_left + - video.hand_right + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.top_head + - video.hand_left + - video.hand_right + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.top_head + - video.hand_left + - video.hand_right + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.left_arm_joint_position + - state.right_arm_joint_position + - state.left_effector_position + - state.right_effector_position + - state.head_position + - state.waist_position + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.left_arm_joint_position + - state.right_arm_joint_position + - state.left_effector_position + - state.right_effector_position + - state.head_position + - state.waist_position + normalization_modes: + state.left_arm_joint_position: min_max + state.right_arm_joint_position: min_max + state.left_effector_position: min_max + state.right_effector_position: min_max + state.head_position: min_max + state.waist_position: min_max + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.left_arm_joint_position + - action.right_arm_joint_position + - action.left_effector_position + - action.right_effector_position + - action.head_position + - action.waist_position + - action.robot_velocity + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.left_arm_joint_position + - action.right_arm_joint_position + - action.left_effector_position + - action.right_effector_position + - action.head_position + - action.waist_position + - action.robot_velocity + normalization_modes: + action.left_arm_joint_position: min_max + action.right_arm_joint_position: min_max + action.left_effector_position: min_max + action.right_effector_position: min_max + action.head_position: min_max + action.waist_position: min_max + action.robot_velocity: min_max + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.top_head + - video.hand_left + - video.hand_right + state_concat_order: + - state.left_arm_joint_position + - state.right_arm_joint_position + - state.left_effector_position + - state.right_effector_position + - state.head_position + - state.waist_position + action_concat_order: + - action.left_arm_joint_position + - action.right_arm_joint_position + - action.left_effector_position + - action.right_effector_position + - action.head_position + - action.waist_position + - action.robot_velocity + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 +modality_configs: + robocasa_gr1_arms_only_fourier_hands: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.ego_view_pad_res256_freq20 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.human.action.task_description + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action + robocasa_gr1_arms_waist_fourier_hands: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.ego_view_pad_res256_freq20 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.human.action.task_description + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action + robocasa_gr1_fixed_lower_body_fourier_hands: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.agentview_pad_res256_freq20 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + - state.neck + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - action.neck + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.human.action.task_description + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action + robocasa_bimanual_panda_parallel_gripper: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_gripper_qpos + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_gripper_qpos + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_gripper_close + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_gripper_close + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.human.action.task_description + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action + robocasa_bimanual_panda_inspire_hand: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_hand + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_hand + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_hand + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_hand + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.human.action.task_description + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action + robocasa_panda_omron: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.left_view + - video.right_view + - video.wrist_view + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.end_effector_position_relative + - state.end_effector_rotation_relative + - state.gripper_qpos + - state.base_position + - state.base_rotation + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.end_effector_position + - action.end_effector_rotation + - action.gripper_close + - action.base_motion + - action.control_mode + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.human.action.task_description + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action + gr1_unified: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.ego_view_pad_res256_freq20 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.human.coarse_action + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action + oxe_droid: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.exterior_image_1_left_pad_res256_freq15 + - video.exterior_image_2_left_pad_res256_freq15 + - video.wrist_image_left_pad_res256_freq15 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.eef_position + - state.eef_rotation + - state.gripper_position + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.eef_position_delta + - action.eef_rotation_delta + - action.gripper_position + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.language.language_instruction + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action + oxe_fractal: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.image_pad_res256_freq03 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.eef_position + - state.eef_rotation + - state.gripper_closedness_commanded + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.world_vector + - action.rotation_delta + - action.gripper_position + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.language.natural_language_instruction + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action + oxe_language_table: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.rgb_pad_res256_freq10 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.effector_translation + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.action + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.language.instruction + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action + oxe_bridge: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.image_0 + - video.image_1 + - video.image_2 + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.eef_position + - state.eef_rotation + - state.gripper_closed + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.eef_position + - action.eef_rotation + - action.gripper_position + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.language.language_instruction + lapa_action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - lapa_action + agibot: + video: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 16 + modality_keys: + - video.top_head + - video.hand_left + - video.hand_right + state: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - state.left_arm_joint_position + - state.right_arm_joint_position + - state.left_effector_position + - state.right_effector_position + - state.head_position + - state.waist_position + action: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + modality_keys: + - action.left_arm_joint_position + - action.right_arm_joint_position + - action.left_effector_position + - action.right_effector_position + - action.head_position + - action.waist_position + - action.robot_velocity + language: + _target_: gr00t.data.dataset.ModalityConfig + delta_indices: + - 0 + modality_keys: + - annotation.agibot.task_description +transforms: + robocasa_gr1_arms_only_fourier_hands: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.ego_view_pad_res256_freq20 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.ego_view_pad_res256_freq20 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.ego_view_pad_res256_freq20 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.ego_view_pad_res256_freq20 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.ego_view_pad_res256_freq20 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + normalization_modes: + state.left_arm: min_max + state.right_arm: min_max + state.left_hand: min_max + state.right_hand: min_max + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + normalization_modes: + action.right_arm: min_max + action.left_arm: min_max + action.right_hand: min_max + action.left_hand: min_max + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.ego_view_pad_res256_freq20 + state_concat_order: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + action_concat_order: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 + robocasa_gr1_arms_waist_fourier_hands: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.ego_view_pad_res256_freq20 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.ego_view_pad_res256_freq20 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.ego_view_pad_res256_freq20 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.ego_view_pad_res256_freq20 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.ego_view_pad_res256_freq20 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + normalization_modes: + state.left_arm: min_max + state.right_arm: min_max + state.left_hand: min_max + state.right_hand: min_max + state.waist: min_max + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + normalization_modes: + action.right_arm: min_max + action.left_arm: min_max + action.right_hand: min_max + action.left_hand: min_max + action.waist: min_max + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.ego_view_pad_res256_freq20 + state_concat_order: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + action_concat_order: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 + robocasa_gr1_fixed_lower_body_fourier_hands: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.agentview_pad_res256_freq20 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.agentview_pad_res256_freq20 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.agentview_pad_res256_freq20 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.agentview_pad_res256_freq20 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.agentview_pad_res256_freq20 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + - state.neck + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + - state.neck + normalization_modes: + state.left_arm: min_max + state.right_arm: min_max + state.left_hand: min_max + state.right_hand: min_max + state.waist: min_max + state.neck: min_max + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - action.neck + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - action.neck + normalization_modes: + action.right_arm: min_max + action.left_arm: min_max + action.right_hand: min_max + action.left_hand: min_max + action.waist: min_max + action.neck: min_max + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.agentview_pad_res256_freq20 + state_concat_order: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + - state.neck + action_concat_order: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - action.neck + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 + robocasa_bimanual_panda_parallel_gripper: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_gripper_qpos + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_gripper_qpos + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_gripper_qpos + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_gripper_qpos + normalization_modes: + state.right_arm_eef_pos: min_max + state.right_gripper_qpos: min_max + state.left_arm_eef_pos: min_max + state.left_gripper_qpos: min_max + target_rotations: + state.right_arm_eef_quat: rotation_6d + state.left_arm_eef_quat: rotation_6d + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_gripper_close + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_gripper_close + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_gripper_close + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_gripper_close + normalization_modes: + action.right_gripper_close: binary + action.left_gripper_close: binary + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + state_concat_order: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_gripper_qpos + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_gripper_qpos + action_concat_order: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_gripper_close + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_gripper_close + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 + robocasa_bimanual_panda_inspire_hand: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_hand + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_hand + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_hand + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_hand + normalization_modes: + state.right_arm_eef_pos: min_max + state.right_hand: min_max + state.left_arm_eef_pos: min_max + state.left_hand: min_max + target_rotations: + state.right_arm_eef_quat: rotation_6d + state.left_arm_eef_quat: rotation_6d + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_hand + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_hand + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_hand + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_hand + normalization_modes: + action.right_hand: min_max + action.left_hand: min_max + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.robot0_eye_in_hand_pad_res256_freq20 + - video.robot1_eye_in_hand_pad_res256_freq20 + - video.agentview_pad_res256_freq20 + state_concat_order: + - state.right_arm_eef_pos + - state.right_arm_eef_quat + - state.right_hand + - state.left_arm_eef_pos + - state.left_arm_eef_quat + - state.left_hand + action_concat_order: + - action.right_arm_eef_pos + - action.right_arm_eef_rot + - action.right_hand + - action.left_arm_eef_pos + - action.left_arm_eef_rot + - action.left_hand + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 + robocasa_panda_omron: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.left_view + - video.right_view + - video.wrist_view + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.left_view + - video.right_view + - video.wrist_view + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.left_view + - video.right_view + - video.wrist_view + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.left_view + - video.right_view + - video.wrist_view + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.left_view + - video.right_view + - video.wrist_view + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.end_effector_position_relative + - state.end_effector_rotation_relative + - state.gripper_qpos + - state.base_position + - state.base_rotation + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.end_effector_position_relative + - state.end_effector_rotation_relative + - state.gripper_qpos + - state.base_position + - state.base_rotation + normalization_modes: + state.end_effector_position_relative: min_max + state.end_effector_rotation_relative: min_max + state.gripper_qpos: min_max + state.base_position: min_max + state.base_rotation: min_max + target_rotations: + state.end_effector_rotation_relative: rotation_6d + state.base_rotation: rotation_6d + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.end_effector_position + - action.end_effector_rotation + - action.gripper_close + - action.base_motion + - action.control_mode + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.end_effector_position + - action.end_effector_rotation + - action.gripper_close + - action.base_motion + - action.control_mode + normalization_modes: + action.end_effector_position: min_max + action.end_effector_rotation: min_max + action.gripper_close: binary + action.base_motion: min_max + action.control_mode: binary + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.left_view + - video.right_view + - video.wrist_view + state_concat_order: + - state.end_effector_position_relative + - state.end_effector_rotation_relative + - state.gripper_qpos + - state.base_position + - state.base_rotation + action_concat_order: + - action.end_effector_position + - action.end_effector_rotation + - action.gripper_close + - action.base_motion + - action.control_mode + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 + gr1_unified: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.ego_view_pad_res256_freq20 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.ego_view_pad_res256_freq20 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.ego_view_pad_res256_freq20 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.ego_view_pad_res256_freq20 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.ego_view_pad_res256_freq20 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + normalization_modes: + state.left_arm: scale + state.right_arm: scale + state.left_hand: scale + state.right_hand: scale + state.waist: scale + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + normalization_modes: + action.left_arm: scale + action.right_arm: scale + action.left_hand: scale + action.right_hand: scale + action.waist: scale + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.ego_view_pad_res256_freq20 + state_concat_order: + - state.left_arm + - state.right_arm + - state.left_hand + - state.right_hand + - state.waist + action_concat_order: + - action.left_arm + - action.right_arm + - action.left_hand + - action.right_hand + - action.waist + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 + oxe_droid: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.exterior_image_1_left_pad_res256_freq15 + - video.exterior_image_2_left_pad_res256_freq15 + - video.wrist_image_left_pad_res256_freq15 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.exterior_image_1_left_pad_res256_freq15 + - video.exterior_image_2_left_pad_res256_freq15 + - video.wrist_image_left_pad_res256_freq15 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.exterior_image_1_left_pad_res256_freq15 + - video.exterior_image_2_left_pad_res256_freq15 + - video.wrist_image_left_pad_res256_freq15 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.exterior_image_1_left_pad_res256_freq15 + - video.exterior_image_2_left_pad_res256_freq15 + - video.wrist_image_left_pad_res256_freq15 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.exterior_image_1_left_pad_res256_freq15 + - video.exterior_image_2_left_pad_res256_freq15 + - video.wrist_image_left_pad_res256_freq15 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.eef_position + - state.eef_rotation + - state.gripper_position + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.eef_position + - state.eef_rotation + - state.gripper_position + normalization_modes: + state.eef_position: min_max + state.gripper_position: min_max + target_rotations: + state.eef_rotation: rotation_6d + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.eef_position_delta + - action.eef_rotation_delta + - action.gripper_position + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.eef_position_delta + - action.eef_rotation_delta + - action.gripper_position + normalization_modes: + action.gripper_position: binary + target_rotations: + action.eef_rotation_delta: axis_angle + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.exterior_image_1_left_pad_res256_freq15 + - video.exterior_image_2_left_pad_res256_freq15 + - video.wrist_image_left_pad_res256_freq15 + state_concat_order: + - state.eef_position + - state.eef_rotation + - state.gripper_position + action_concat_order: + - action.eef_position_delta + - action.eef_rotation_delta + - action.gripper_position + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 + oxe_fractal: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.image_pad_res256_freq03 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.image_pad_res256_freq03 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.image_pad_res256_freq03 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.image_pad_res256_freq03 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.image_pad_res256_freq03 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.eef_position + - state.eef_rotation + - state.gripper_closedness_commanded + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.eef_position + - state.eef_rotation + - state.gripper_closedness_commanded + normalization_modes: + state.eef_position: min_max + state.gripper_closedness_commanded: min_max + target_rotations: + state.eef_rotation: rotation_6d + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.world_vector + - action.rotation_delta + - action.gripper_position + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.world_vector + - action.rotation_delta + - action.gripper_position + normalization_modes: + action.gripper_position: binary + target_rotations: + action.rotation_delta: axis_angle + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.image_pad_res256_freq03 + state_concat_order: + - state.eef_position + - state.eef_rotation + - state.gripper_closedness_commanded + action_concat_order: + - action.world_vector + - action.rotation_delta + - action.gripper_position + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 + oxe_language_table: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.rgb_pad_res256_freq10 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.rgb_pad_res256_freq10 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.rgb_pad_res256_freq10 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.rgb_pad_res256_freq10 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.rgb_pad_res256_freq10 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.effector_translation + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.effector_translation + normalization_modes: + state.effector_translation: min_max + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.action + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.action + normalization_modes: + action.action: min_max + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.rgb_pad_res256_freq10 + state_concat_order: + - state.effector_translation + action_concat_order: + - action.action + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 + oxe_bridge: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.image_0 + - video.image_1 + - video.image_2 + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.image_0 + - video.image_1 + - video.image_2 + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.image_0 + - video.image_1 + - video.image_2 + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.image_0 + - video.image_1 + - video.image_2 + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.image_0 + - video.image_1 + - video.image_2 + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.eef_position + - state.eef_rotation + - state.gripper_closed + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.eef_position + - state.eef_rotation + - state.gripper_closed + normalization_modes: + state.eef_position: min_max + state.gripper_closed: min_max + target_rotations: + state.eef_rotation: rotation_6d + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.eef_position + - action.eef_rotation + - action.gripper_position + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.eef_position + - action.eef_rotation + - action.gripper_position + normalization_modes: + action.gripper_position: binary + target_rotations: + action.eef_rotation: axis_angle + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.image_0 + - video.image_1 + - video.image_2 + state_concat_order: + - state.eef_position + - state.eef_rotation + - state.gripper_closed + action_concat_order: + - action.eef_position + - action.eef_rotation + - action.gripper_position + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 + agibot: + _target_: gr00t.data.transform.ComposedModalityTransform + transforms: + - _target_: gr00t.data.transform.VideoToTensor + apply_to: + - video.top_head + - video.hand_left + - video.hand_right + - _target_: gr00t.data.transform.VideoCrop + apply_to: + - video.top_head + - video.hand_left + - video.hand_right + scale: 0.95 + mode: random + - _target_: gr00t.data.transform.VideoResize + apply_to: + - video.top_head + - video.hand_left + - video.hand_right + height: 224 + width: 224 + interpolation: linear + - _target_: gr00t.data.transform.VideoColorJitter + apply_to: + - video.top_head + - video.hand_left + - video.hand_right + brightness: 0.3 + contrast: 0.4 + saturation: 0.5 + hue: 0.08 + - _target_: gr00t.data.transform.VideoToNumpy + apply_to: + - video.top_head + - video.hand_left + - video.hand_right + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - state.left_arm_joint_position + - state.right_arm_joint_position + - state.left_effector_position + - state.right_effector_position + - state.head_position + - state.waist_position + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - state.left_arm_joint_position + - state.right_arm_joint_position + - state.left_effector_position + - state.right_effector_position + - state.head_position + - state.waist_position + normalization_modes: + state.left_arm_joint_position: min_max + state.right_arm_joint_position: min_max + state.left_effector_position: min_max + state.right_effector_position: min_max + state.head_position: min_max + state.waist_position: min_max + - _target_: gr00t.data.transform.StateActionToTensor + apply_to: + - action.left_arm_joint_position + - action.right_arm_joint_position + - action.left_effector_position + - action.right_effector_position + - action.head_position + - action.waist_position + - action.robot_velocity + - _target_: gr00t.data.transform.StateActionTransform + apply_to: + - action.left_arm_joint_position + - action.right_arm_joint_position + - action.left_effector_position + - action.right_effector_position + - action.head_position + - action.waist_position + - action.robot_velocity + normalization_modes: + action.left_arm_joint_position: min_max + action.right_arm_joint_position: min_max + action.left_effector_position: min_max + action.right_effector_position: min_max + action.head_position: min_max + action.waist_position: min_max + action.robot_velocity: min_max + - _target_: gr00t.data.transform.ConcatTransform + video_concat_order: + - video.top_head + - video.hand_left + - video.hand_right + state_concat_order: + - state.left_arm_joint_position + - state.right_arm_joint_position + - state.left_effector_position + - state.right_effector_position + - state.head_position + - state.waist_position + action_concat_order: + - action.left_arm_joint_position + - action.right_arm_joint_position + - action.left_effector_position + - action.right_effector_position + - action.head_position + - action.waist_position + - action.robot_velocity + - _target_: gr00t.model.transforms_idm.GR00TIDMTransform + default_instruction: Perform the default behavior. + num_visual_tokens_per_frame: 16 + max_num_images_per_sequence: 6 + max_action_dim: 32 + max_sequence_length: 112 + action_horizon: 16 + siglip_processor: + _target_: gr00t.model.action_head.siglip.SiglipProcessor.from_pretrained + _convert_: object + pretrained_model_name_or_path: google/siglip2-large-patch16-256 + embodiment_tag_mapping: + real_gr1_arms_only: 0 + real_gr1_arms_only_annotated: 1 + real_gr1_arms_waist: 2 + real_gr1_arms_waist_annotated: 3 + dexmg_gr1_arms_only_inspire: 4 + dexmg_gr1_arms_only_fourier: 5 + dexmg_gr1_arms_waist_fourier: 6 + robocasa_single_arm: 7 + onex_eve_gripper: 8 + robocasa_gr1_arms_only_inspire_hands: 9 + robocasa_gr1_arms_only_fourier_hands: 10 + robocasa_gr1_fixed_lower_body_inspire_hands: 11 + robocasa_gr1_fixed_lower_body_fourier_hands: 12 + robocasa_panda_omron: 13 + robocasa_single_arm_panda_omron: 14 + robocasa_bimanual_panda_parallel_gripper: 15 + robocasa_bimanual_panda_inspire_hand: 16 + oxe_droid: 17 + oxe_fractal: 18 + oxe_language_table: 19 + oxe_bridge: 20 + real_panda_single_arm: 21 + unknown: 22 + hot3d_hands_only: 23 + gr1_unified: 24 + robocasa_gr1_arms_waist_fourier_hands: 25 + agibot: 26 + lapa: 27 + oxe_mutex: 28 + oxe_roboset: 29 + oxe_plex: 30 + dream: 31 +metadata_versions: + robocasa_gr1_arms_only_fourier_hands: '0217' + robocasa_gr1_fixed_lower_body_fourier_hands: '0217' + robocasa_bimanual_panda_parallel_gripper: '0217' + robocasa_bimanual_panda_inspire_hand: '0217' + robocasa_panda_omron: '0217' + gr1_unified: '0225' + oxe_droid: '0221' + oxe_fractal: '0221' + oxe_language_table: '0221' + oxe_bridge: '0221' + robocasa_gr1_arms_waist_fourier_hands: '0225' + agibot: '0225' +dataset_path: ??? +max_state_dim: 44 +mixture_dataset_cls: gr00t.data.dataset.lerobot_sharded.ShardedLeRobotMixtureDataset.from_mixture_spec +single_dataset_cls: gr00t.data.dataset.lerobot_sharded.ShardedLeRobotSingleDataset +data_root: /mnt/amlfs-02/shared/datasets +gr00t_commit_hash: 16d97a65f0541e14efa958455542c5ae3ad9607f +total_training_steps: 163840000000 diff --git a/experiment_cfg/metadata.json b/experiment_cfg/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..81ba189d6faf307e4cd44b092bdc62269804e221 --- /dev/null +++ b/experiment_cfg/metadata.json @@ -0,0 +1,826 @@ +{ + "robocasa_panda_omron": { + "dataset_name": "robocasa_panda_omron:FullSet, robocasa_panda_omron:FullSet, robocasa_panda_omron:FullSet, robocasa_panda_omron:FullSet, robocasa_panda_omron:FullSet, robocasa_panda_omron:FullSet, robocasa_panda_omron:FullSet, robocasa_panda_omron:FullSet, robocasa_panda_omron:FullSet, robocasa_panda_omron:FullSet, robocasa_panda_omron:FullSet, robocasa_panda_omron:FullSet, robocasa_panda_omron:FullSet, robocasa_panda_omron:FullSet, robocasa_panda_omron:FullSet, robocasa_panda_omron:FullSet, robocasa_panda_omron:FullSet, robocasa_panda_omron:FullSet, robocasa_panda_omron:FullSet, robocasa_panda_omron:FullSet, robocasa_panda_omron:FullSet, robocasa_panda_omron:FullSet, robocasa_panda_omron:FullSet, robocasa_panda_omron:FullSet", + "dataset_statistics": { + "state": { + "base_position": { + "max": [ + 5.60412073135376, + -0.4872598946094513, + 0.8319682478904724 + ], + "min": [ + 0.31250813603401184, + -5.110836982727051, + 0.6947912573814392 + ], + "mean": [ + 2.5233612060546875, + -1.4497382640838627, + 0.7089967727661134 + ], + "std": [ + 1.4186391830444336, + 0.8250066041946406, + 0.008732615038746 + ], + "q01": [ + 0.5000049471855164, + -4.100456237792969, + 0.6999498009681703 + ], + "q99": [ + 5.5999813079834, + -0.7786697745323181, + 0.7069445848464966 + ] + }, + "base_rotation": { + "max": [ + 0.0, + 0.0, + 1.0, + 1.0 + ], + "min": [ + 0.0, + 0.0, + -1.0, + 0.0 + ], + "mean": [ + 0.0, + 0.0, + 0.25808697938919073, + 0.6459704637527465 + ], + "std": [ + 0.0, + 0.0, + 0.5877522826194763, + 0.3639818727970123 + ], + "q01": [ + 0.0, + 0.0, + -0.9999999999999999, + 5.5698886990285226e-06 + ], + "q99": [ + 0.0, + 0.0, + 0.9999999999999999, + 0.9999999999999999 + ] + }, + "end_effector_position_absolute": { + "max": [ + 6.200742721557617, + -0.00844558235257864, + 1.8568346500396729 + ], + "min": [ + 0.011350978165864944, + -5.129131317138672, + 0.6468141078948975 + ], + "mean": [ + 2.5114300251007076, + -1.360061168670654, + 1.339444875717163 + ], + "std": [ + 1.5792435407638563, + 0.9657800793647767, + 0.25533461570739835 + ], + "q01": [ + 0.15726516664028167, + -4.127151670455933, + 0.7751663839817047 + ], + "q99": [ + 5.854645910263061, + -0.14227030426263923, + 1.6728234744071961 + ] + }, + "end_effector_position_relative": { + "max": [ + 0.8734620809555054, + 0.9110966920852661, + 1.1568338871002197 + ], + "min": [ + -0.46594083309173584, + -0.8583298325538635, + -0.05373392626643181 + ], + "mean": [ + 0.2642126679420471, + -0.037328604608774185, + 0.5461296439170839 + ], + "std": [ + 0.18807946145534513, + 0.2574712634086609, + 0.23820365965366327 + ], + "q01": [ + -0.25546322107315067, + -0.6193294751644135, + 0.07401651039719581 + ], + "q99": [ + 0.7370126557350158, + 0.6061198198795318, + 0.9726848316192626 + ] + }, + "end_effector_rotation_absolute": { + "max": [ + 0.9999977350234985, + 0.9999508857727051, + 0.9938080310821533, + 0.9647020697593689 + ], + "min": [ + -0.9724215269088745, + -0.9999970197677612, + -0.9864645600318909, + -0.9823317527770996 + ], + "mean": [ + 0.47370263934135426, + 0.0261838398873806, + 0.16200047731399536, + -0.04991654306650162 + ], + "std": [ + 0.41681185364723217, + 0.608587920665741, + 0.2949107885360718, + 0.3205944299697876 + ], + "q01": [ + -0.6978112339973449, + -0.9920166134834288, + -0.6591729152202607, + -0.7654285311698916 + ], + "q99": [ + 0.9918085932731626, + 0.9379134571552272, + 0.7881702184677124, + 0.7439913916587824 + ] + }, + "end_effector_rotation_relative": { + "max": [ + 0.9999980926513672, + 0.998063862323761, + 0.9959006905555725, + 0.9754553437232971 + ], + "min": [ + -0.9999996423721313, + -0.9978545904159546, + -0.9934196472167969, + 1.742676403182486e-08 + ], + "mean": [ + -0.15737581253051755, + 0.12279410660266878, + 0.0075049446895718575, + 0.19545103609561915 + ], + "std": [ + 0.7716760039329528, + 0.35558718442916865, + 0.34857386350631714, + 0.23019900918006903 + ], + "q01": [ + -0.9974887371063231, + -0.6873972046375275, + -0.774531364440918, + 0.0013910925900563597 + ], + "q99": [ + 0.9955765008926392, + 0.8495244240760801, + 0.7640593051910399, + 0.8050539493560789 + ] + }, + "gripper_qpos": { + "max": [ + 0.0993952602148056, + 0.12181679159402847 + ], + "min": [ + -0.034708939492702484, + -0.12887509167194366 + ], + "mean": [ + 0.03116090223193169, + -0.0305799338966608 + ], + "std": [ + 0.01336925104260443, + 0.01332307420670987 + ], + "q01": [ + -4.0376453907811085e-05, + -0.040308743715286255 + ], + "q99": [ + 0.04063318811357021, + -0.00040857097716070936 + ] + }, + "gripper_qvel": { + "max": [ + 3.6628172397613525, + 16.347944259643555 + ], + "min": [ + -3.065218210220337, + -1.0290303230285645 + ], + "mean": [ + 0.0007271775975823402, + -0.0007187921437434852 + ], + "std": [ + 0.01484494563192129, + 0.01693008281290531 + ], + "q01": [ + -0.06051826104521752, + -0.06142193399369715 + ], + "q99": [ + 0.06037045575678332, + 0.06032651640474791 + ] + }, + "joint_position": { + "max": [ + 2.914743423461914, + 1.7686904668807983, + 2.916494846343994, + -0.016307059675455093, + 2.939373016357422, + 3.790846109390259, + 2.954829454421997 + ], + "min": [ + -2.907266616821289, + -1.7821346521377563, + -2.9019548892974854, + -3.095099449157715, + -2.937913417816162, + -0.0009604570223018527, + -2.92836594581604 + ], + "mean": [ + -0.002226361073553562, + -0.7944630384445189, + -0.13673430681228638, + -2.1579504013061523, + 0.22298482060432434, + 1.9989188909530637, + 0.6232013106346129 + ], + "std": [ + 0.39000499248504633, + 0.6092476248741152, + 0.6991883516311646, + 0.6551395654678345, + 0.736649513244629, + 0.568905234336853, + 0.8584070205688478 + ], + "q01": [ + -1.1111454963684084, + -1.7629053592681887, + -1.9500407242774964, + -3.0719740390777583, + -1.3464225411415103, + 0.710475721359253, + -1.6062000060081483 + ], + "q99": [ + 1.4440935993194488, + 0.7535663640499111, + 1.9061625623702998, + -0.06965120136737823, + 2.8974728584289555, + 3.515558786392211, + 2.689405360221862 + ] + }, + "joint_position_cos": { + "max": [ + 1.0, + 1.0, + 1.0, + 0.9998670220375061, + 1.0, + 0.9999995231628418, + 1.0 + ], + "min": [ + -0.9743798971176147, + -0.20976868271827698, + -0.9747722744941711, + -0.9989193677902222, + -0.9796231389045715, + -1.0, + -0.9826104044914246 + ], + "mean": [ + 0.8548306226730348, + 0.5775746703147887, + 0.8072980046272277, + -0.4509732127189635, + 0.8170415759086608, + -0.32237827777862554, + 0.5677806138992308 + ], + "std": [ + 0.18712373077869357, + 0.3143303990364076, + 0.3379428088665012, + 0.5030199885368347, + 0.4237685203552249, + 0.4407167136669159, + 0.44018274545669567 + ], + "q01": [ + -0.02635294888168575, + -0.19092965126037595, + -0.431967819929123, + -0.9975776076316832, + -0.9703514575958252, + -0.9988470673561097, + -0.927694798707962 + ], + "q99": [ + 0.9999993443489074, + 0.9995868206024171, + 0.9999943375587463, + 0.9975753426551818, + 0.9999939799308778, + 0.7580517125129691, + 0.9997406005859374 + ] + }, + "joint_position_sin": { + "max": [ + 1.0, + 1.0, + 1.0, + -0.016306336969137192, + 1.0, + 1.0, + 1.0 + ], + "min": [ + -1.0, + -1.0, + -1.0, + -1.0, + -1.0, + -0.6045919060707092, + -1.0 + ], + "mean": [ + -0.009903629310429098, + -0.6532469391822815, + -0.11583427339792252, + -0.7166314125061036, + 0.10584983974695206, + 0.8206916451454164, + 0.42647859454154957 + ], + "std": [ + 0.30439406633377075, + 0.45045891404151917, + 0.49071264266967773, + 0.26929175853729226, + 0.41677382588386536, + 0.28926900029182395, + 0.5585145950317384 + ], + "q01": [ + -0.8871335268020629, + -0.9998832941055298, + -0.9991828799247741, + -0.9997999668121336, + -0.9557854759693147, + -0.3653103184700012, + -0.9881189262866972 + ], + "q99": [ + 0.9552828037738794, + 0.6842438745498653, + 0.997663722038269, + -0.06941956743597984, + 0.9985226392745972, + 0.9999756217002866, + 0.9997976422309875 + ] + }, + "joint_velocity": { + "max": [ + 4.507856369018555, + 4.772953987121582, + 5.447414875030518, + 6.97592830657959, + 11.213868141174316, + 8.45352554321289, + 7.553228378295898 + ], + "min": [ + -4.643657207489014, + -3.8053159713745117, + -10.225778579711914, + -5.300220489501953, + -8.159320831298828, + -7.285642147064209, + -6.582308292388916 + ], + "mean": [ + -0.0051504699513316154, + 0.017983481287956238, + -0.0032231842633336795, + 0.018490538001060486, + 0.01154279615730047, + 0.03427882865071296, + -0.008020073175430298 + ], + "std": [ + 0.22057239711284635, + 0.3303338885307312, + 0.25083741545677185, + 0.3530645966529846, + 0.3323102593421936, + 0.3750852048397064, + 0.3499438166618347 + ], + "q01": [ + -0.736943724155426, + -1.0061864113807681, + -0.6962891066074371, + -1.0777667713165284, + -0.9365327370166779, + -1.0524059319496155, + -1.0341050219535823 + ], + "q99": [ + 0.6565076839923846, + 0.8276887702941891, + 0.7165593433380111, + 1.0437071704864485, + 1.0606284761428828, + 1.047166392803192, + 0.9755567121505715 + ] + } + }, + "action": { + "base_motion": { + "max": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "min": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "q01": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "q99": [ + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "control_mode": { + "max": [ + 0.0 + ], + "min": [ + 0.0 + ], + "mean": [ + 0.0 + ], + "std": [ + 0.0 + ], + "q01": [ + 0.0 + ], + "q99": [ + 0.0 + ] + }, + "end_effector_position": { + "max": [ + 1.0, + 1.0, + 1.0 + ], + "min": [ + -1.0, + -1.0, + -1.0 + ], + "mean": [ + 0.03037301078438759, + 0.003446326823905111, + -0.040158476680517204 + ], + "std": [ + 0.4068017899990082, + 0.3985259532928467, + 0.36361521482467646 + ], + "q01": [ + -0.9999999999999999, + -0.9999999999999999, + -0.9999999999999999 + ], + "q99": [ + 0.9999999999999999, + 0.9999999999999999, + 0.9999999999999999 + ] + }, + "end_effector_rotation": { + "max": [ + 1.0, + 1.0, + 1.0 + ], + "min": [ + -1.0, + -1.0, + -1.0 + ], + "mean": [ + 0.006670467555522919, + -0.022744668647646907, + -0.002840776927769184 + ], + "std": [ + 0.14142614603042605, + 0.1476612389087677, + 0.13538147509098053 + ], + "q01": [ + -0.41466328322887414, + -0.508793534040451, + -0.4084805303812027 + ], + "q99": [ + 0.4669318711757646, + 0.3791767048835726, + 0.4212867003679271 + ] + }, + "gripper_close": { + "max": [ + 1.0 + ], + "min": [ + 0.0 + ], + "mean": [ + 0.3556057214736939 + ], + "std": [ + 0.49354264140129084 + ], + "q01": [ + 0.0 + ], + "q99": [ + 1.0 + ] + } + }, + "total_trajectory_length": 9288, + "num_trajectories": 1691280 + }, + "modalities": { + "video": { + "res256_image_side_0": { + "resolution": [ + 256, + 256 + ], + "channels": 3, + "fps": 20.0 + }, + "res256_image_side_1": { + "resolution": [ + 256, + 256 + ], + "channels": 3, + "fps": 20.0 + }, + "res256_image_wrist_0": { + "resolution": [ + 256, + 256 + ], + "channels": 3, + "fps": 20.0 + } + }, + "state": { + "base_position": { + "absolute": true, + "rotation_type": null, + "shape": [ + 3 + ], + "continuous": true + }, + "base_rotation": { + "absolute": true, + "rotation_type": "quaternion", + "shape": [ + 4 + ], + "continuous": true + }, + "end_effector_position_absolute": { + "absolute": true, + "rotation_type": null, + "shape": [ + 3 + ], + "continuous": true + }, + "end_effector_position_relative": { + "absolute": true, + "rotation_type": null, + "shape": [ + 3 + ], + "continuous": true + }, + "end_effector_rotation_absolute": { + "absolute": true, + "rotation_type": "quaternion", + "shape": [ + 4 + ], + "continuous": true + }, + "end_effector_rotation_relative": { + "absolute": true, + "rotation_type": "quaternion", + "shape": [ + 4 + ], + "continuous": true + }, + "gripper_qpos": { + "absolute": true, + "rotation_type": null, + "shape": [ + 2 + ], + "continuous": true + }, + "gripper_qvel": { + "absolute": true, + "rotation_type": null, + "shape": [ + 2 + ], + "continuous": true + }, + "joint_position": { + "absolute": true, + "rotation_type": null, + "shape": [ + 7 + ], + "continuous": true + }, + "joint_position_cos": { + "absolute": true, + "rotation_type": null, + "shape": [ + 7 + ], + "continuous": true + }, + "joint_position_sin": { + "absolute": true, + "rotation_type": null, + "shape": [ + 7 + ], + "continuous": true + }, + "joint_velocity": { + "absolute": true, + "rotation_type": null, + "shape": [ + 7 + ], + "continuous": true + } + }, + "action": { + "base_motion": { + "absolute": false, + "rotation_type": null, + "shape": [ + 4 + ], + "continuous": true + }, + "control_mode": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": false + }, + "end_effector_position": { + "absolute": false, + "rotation_type": null, + "shape": [ + 3 + ], + "continuous": true + }, + "end_effector_rotation": { + "absolute": false, + "rotation_type": "axis_angle", + "shape": [ + 3 + ], + "continuous": true + }, + "gripper_close": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": false + } + }, + "annotation": { + "human": [ + "action.task_description", + "action.task_name", + "validity" + ] + } + }, + "embodiment": { + "robot_name": "Franka", + "robot_type": "Panda+OmronBase", + "record_frequency": 20.0, + "body_controller_frequency": null, + "hand_controller_frequency": null, + "embodiment_tag": "robocasa_panda_omron" + }, + "processing": null, + "version": null + } +} \ No newline at end of file diff --git a/global_step60000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae1101888ae87150d30994de598342322b43191d --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7dbb2d251044deb8f9acff69d9a8797ec150120943c7286831021e6cd344f +size 229582448 diff --git a/global_step60000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae589fd811a4150e417b6f85bcf1296b5206ca40 --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b31d4b429a8a14d5ce8e424f18eaf160610d8f75f9f459d54e3e3dd69ae404c +size 229582140 diff --git a/global_step60000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..95a76cb50ce028dde47144ee23bf933219e21e8b --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddc1012d3521da3a017aa15d374259c4abdb2f0d52b830cd8eb67f3b77657ec0 +size 229582268 diff --git a/global_step60000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb9f6f9aa5e0a257dab67174299eb31925880b84 --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8f0dcef5cb996c826afb6a3de48a412bd7bb106c20bbec14364478e69696c34 +size 229582140 diff --git a/global_step60000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9c600fc3e8ede4849a4a7105c3db90612d80f0c7 --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f808d2290a2328aad0bb97c6149c3dc2778ff2308bf05e2badcb09c93996697 +size 229582140 diff --git a/global_step60000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b327bce7c5f144a2190d840b016d81a5004bc4d --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae17b7810ccfa4f7aaf4521645cb604c69a5752213823bd5e59a58a3ee8f9e2d +size 229582140 diff --git a/global_step60000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..427540d97c7d49116d64171d1e2ea0b44a2c9f15 --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21ca9fca0ba43394b6c2fa6735195897163250f1627856a6de0a522aeab5165d +size 229582076 diff --git a/global_step60000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..31fc4eac0fc8018cc71cc3b947d3646778e100ef --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d95dab1fb53717cd9cfbc147459cb13ef57197be641d3e46a9d9f64f0380bc17 +size 229581948 diff --git a/global_step60000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..86c5e2535a61f840831aeda057ff10bec5b07f74 --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12677d2ca71d20331c5793349c9d4497cab4a4fbb0e443b87b111689413082d1 +size 229582268 diff --git a/global_step60000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e227516b53df3b59bbe0685d5a0cb1ab864c037 --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dce6ffab5363b1c6b8cf870e3dfb0a93913e4703687872658754401d1232fec2 +size 229581884 diff --git a/global_step60000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2a113a8759ba6d1fb5adae17e9d09853902cd80 --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22bfc4d32d70da39933a5161d4c02ff6bdd222adfe9f53a3db60dd6b3c498cdc +size 229581948 diff --git a/global_step60000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1045f1f110c1da5304b7fdffb996004d87d9a8d3 --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0beaaf9b55b900d8e660eaac3c510098962a2d9500915f2bc611a188ceacd20 +size 229582000 diff --git a/global_step60000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e32da541dff7b959a3fd3858edafbef4df2befac --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b148fb577fa3c9f7100dc0f08de4a4f5f8a83d0f25c332501b21b37c3567c7c +size 229582012 diff --git a/global_step60000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9053359904ca93d740b9ae1c17a6cfe89314c897 --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3697a20e7e6020eb408f9955d74757d6f940f998906ab311fb94d6bbf773ee6f +size 229581692 diff --git a/global_step60000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec2bba7004597fc5200f3309ad29d783d4346814 --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9320cd5dd46980821ccb5871d9719cb3a9b8b2bcfeba281890bf62fb19af0702 +size 229581692 diff --git a/global_step60000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a115a6600e8b922f105c28f742cba5b8aa1e789f --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:824cd3b2fe93cde3e90673d5eb76292238f217eb5d06ed86f18c54ccc172aed2 +size 229581756 diff --git a/global_step60000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1036004d34846c0e5fc44842f1b40670760419d3 --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd137aca3e137da84d345e38bebdd37f5d701a424489a3ad06fb2fb875827ea7 +size 229581180 diff --git a/global_step60000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb5f09b0330a05684d9bf923dba853e51e7958f0 --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3d159b12a4466a0cb31f19b06386291e40cfa26ddbb12b715c5b4d87e96eb0f +size 229580668 diff --git a/global_step60000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1fa6a7c590901c7aeb4e581db5b764742eb6f9de --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04119e882238ae560480e38eb3c57c42298a9ae9ca480e1b579c2a3a52244cca +size 229580732 diff --git a/global_step60000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..529e1b1f56a33464469007876f88aad317c8e8b9 --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a27132dae2b11fb83822ba08111d1a601bf386a2229668388cba8d0a18c7e9c +size 229581244 diff --git a/global_step60000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb4dc403211ddafa9cf14be580043fc5936a02e7 --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1eafc89c95c3bcc9f2e91770e68fad9ac8f190cd15b5ca37b981c9be10e5a4e +size 229581180 diff --git a/global_step60000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..087d06fc1ac28d11e8ab43492bce0554a3bdff39 --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35d74c93993be53ad46a1f7f95c11096335833852fceb1e9b561bdb21fcce28e +size 229581244 diff --git a/global_step60000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd926d736129665db2dc8b07ec02b5eaae5bbb23 --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3397fac9272c3247df75b04ba6d4c39c2d0805aa037bf8acca4098c747a96b24 +size 229582320 diff --git a/global_step60000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a8c2d70a5d257778c45cb4bffdda8cfa53b5f378 --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32e0b1c19f6cdb9765abdc7c4b77bf4e3de99060c1fabc30b62719afd942c7a5 +size 229580732 diff --git a/global_step60000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3faf087c241014abec1ba9ac6be7df8c576ee09e --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0118cd25a916783426663e72fe8697e3b084c8c03757a8f0a4f3b85d2900cf46 +size 229580988 diff --git a/global_step60000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5317847d3a559cd53d3adab982b3a5087269a04e --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a26201057835f03cf2570c52770d5dc299909f6242e26b5a8af5386964e25cf1 +size 229582000 diff --git a/global_step60000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fd4570c4af8c97364eea4aac5395759f3e39ef0 --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7eb66ab81a30e35ea7ae512dd4fb9d36615ca1ef4ffb5cf128d3eb662a6568ee +size 229582320 diff --git a/global_step60000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2155f1f0e6db0698e85f870d994fbd65d2aa39e4 --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40b17016e84e25b047fea4fe88223163f7f2a82a5fde149fad1283c66d09233d +size 229581872 diff --git a/global_step60000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..45b599d204c68cf8049a8265ce034514f07e8fa1 --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98a052411719c96070e3ffb4c42d20375df2eb1da87af4664c9387cf77815d3e +size 229582256 diff --git a/global_step60000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ce1f61812c7ade41556ae9c2cbd8e1da798b2a7 --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30d9ab0ad83d200ffc89a41122762d48e52c29d4501140b5a096fc2eb72c965d +size 229582000 diff --git a/global_step60000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0178919ee2a6c2f8a2ac1b2aa71f5f036d6642e4 --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e0879971bbb1a7ac429e144e49e3c1bf67fb92c441309612f6e609fab6d6e40 +size 229582256 diff --git a/global_step60000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/global_step60000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa5b83ac327a5624674adc3c668a3cbeb143f939 --- /dev/null +++ b/global_step60000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b986897bd9748bd2ea23b3a9dd786e0ad2029e34a2ae5e4e1db5131fa1a24a75 +size 229582256 diff --git a/global_step60000/mp_rank_00_model_states.pt b/global_step60000/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aabdf9f18aaf5949ba8af3043d0d7e9a9817762f --- /dev/null +++ b/global_step60000/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e865663cf02bb399c9c3c2b5b733ac95c9b16e8a890640bc6ff3ebeede071bd3 +size 1325393186 diff --git a/latest b/latest new file mode 100644 index 0000000000000000000000000000000000000000..17323c771c50a997c698ff34e921f897464ce2a5 --- /dev/null +++ b/latest @@ -0,0 +1 @@ +global_step60000 \ No newline at end of file diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..38fe0b4f6b57e2227bcb63c11002a2f464338a1e --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f30bcb0b53eba6e4225fec41e8b953c7cd77221320aaca1285e3e5321cc3b8e +size 1274869220 diff --git a/rng_state_0.pth b/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9ad8fdfdc7e4edfdc91f812c755e75d181b19eda --- /dev/null +++ b/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b5a06ff249b9991f1fd615d938bda8244b255458c851656ac52db6479ce1940 +size 14256 diff --git a/rng_state_1.pth b/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6e0b80b0c4f4d279a9a52f72b2510374caa202cd --- /dev/null +++ b/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94d40bdbddd3cd698c19a0494bda3b649ed693b95ca1d680a10f37b2eccf250b +size 14256 diff --git a/rng_state_10.pth b/rng_state_10.pth new file mode 100644 index 0000000000000000000000000000000000000000..57d49448205f7f72f83e6f8cf1cd45f87aad892e --- /dev/null +++ b/rng_state_10.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a33f183849df8388d54d79473a5a9eeb518752e33d0f1faee0f17b149a6ac205 +size 14262 diff --git a/rng_state_11.pth b/rng_state_11.pth new file mode 100644 index 0000000000000000000000000000000000000000..08ce4ae3ea3d0523d4b333b0082a7129da4b8f85 --- /dev/null +++ b/rng_state_11.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c94abbc1d333765a5585d72a643f24cae81ca7fe1161a874e7dcaabbf7bd29d +size 14262 diff --git a/rng_state_12.pth b/rng_state_12.pth new file mode 100644 index 0000000000000000000000000000000000000000..3bfc2f8b8acbdbb7aaa9f32d91e85c8504a47000 --- /dev/null +++ b/rng_state_12.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b56499538e7d62fbd4d2bff54cbb667287939a0f6a94bf9e1cb09c201c3768c +size 14262 diff --git a/rng_state_13.pth b/rng_state_13.pth new file mode 100644 index 0000000000000000000000000000000000000000..a175e92bef35a0614729505680f2d0c74672aee9 --- /dev/null +++ b/rng_state_13.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06acfc0f3525f99c12bb83e8c544b9326b328c5a4d2ef517cb977b9794bebe4c +size 14262 diff --git a/rng_state_14.pth b/rng_state_14.pth new file mode 100644 index 0000000000000000000000000000000000000000..1a4cfdf8f71788b648a268c5d15b4aba26665c55 --- /dev/null +++ b/rng_state_14.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:437d4f5244d715af7f41df1f8dd9cd7c280e626d93971bf9615875624142ce4a +size 14262 diff --git a/rng_state_15.pth b/rng_state_15.pth new file mode 100644 index 0000000000000000000000000000000000000000..da201fd61ab3810525cd8ba0761b16a3f0901c59 --- /dev/null +++ b/rng_state_15.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7646ef496596683994b711ca3cc7e183550944e94fb5ca099b386f1b40861c5 +size 14262 diff --git a/rng_state_16.pth b/rng_state_16.pth new file mode 100644 index 0000000000000000000000000000000000000000..0aad331a40b9bfcf5a273d427c878edec8eadcde --- /dev/null +++ b/rng_state_16.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c889ad312c69a6bc978e18ff28c3ee88a5123c118d1d54af319fed32de920f5 +size 14262 diff --git a/rng_state_17.pth b/rng_state_17.pth new file mode 100644 index 0000000000000000000000000000000000000000..f7c88fb83a255b8e3c8a157e3a70a0c96ecd2bb4 --- /dev/null +++ b/rng_state_17.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0e6b4ebb5992685933e5e9eae6dc704358b5b430ee7474261ee21e4da4bf946 +size 14262 diff --git a/rng_state_18.pth b/rng_state_18.pth new file mode 100644 index 0000000000000000000000000000000000000000..73661021c4ed2b875870b0e76503a4645675350e --- /dev/null +++ b/rng_state_18.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccd8bfc980810b4fd4cdab925c4cff5e8864e666db57a8c602e36d96ea2f9c9b +size 14262 diff --git a/rng_state_19.pth b/rng_state_19.pth new file mode 100644 index 0000000000000000000000000000000000000000..b4cbdad14225048915ef318aaefa142267171361 --- /dev/null +++ b/rng_state_19.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa6e4ec6b51790359e91f854f693f69114d622273f56c416a7731200191962a2 +size 14262 diff --git a/rng_state_2.pth b/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..08fdd7277117c1c17ae30ad6aa40e971f6058811 --- /dev/null +++ b/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a1aa094029f49a0e093be84c1a9739db4bd59eb236bd1886fe81ccb1043d525 +size 14256 diff --git a/rng_state_20.pth b/rng_state_20.pth new file mode 100644 index 0000000000000000000000000000000000000000..1c216f193797c03ec18ea1a6f00cda2e9c6579eb --- /dev/null +++ b/rng_state_20.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d71ad7b3a19ce104c17291028f5e5a0964998b0c51010f1ce3d72eb8a673349d +size 14262 diff --git a/rng_state_21.pth b/rng_state_21.pth new file mode 100644 index 0000000000000000000000000000000000000000..72c2c6617e6a63f2cdd1ba2d4ee4d2b3a76240fc --- /dev/null +++ b/rng_state_21.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72ebfea3032ffbec90d97749f70eeadb1788143c82e2662864ca78e6aba6fbbe +size 14262 diff --git a/rng_state_22.pth b/rng_state_22.pth new file mode 100644 index 0000000000000000000000000000000000000000..ed4c7b86fdb400b0b42a729dfcb4554b7d0033d9 --- /dev/null +++ b/rng_state_22.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04a6e379d538da2a5038c06f843c9d526f5ee293ed318df5cd719049173fe8da +size 14262 diff --git a/rng_state_23.pth b/rng_state_23.pth new file mode 100644 index 0000000000000000000000000000000000000000..d05508f29cf3f994d26a843de9a7e8ed8fb027ca --- /dev/null +++ b/rng_state_23.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6069253e1f3d7d01d5cd68b4d3b7199beca920554a5ffdee2fec2f5611f6f665 +size 14262 diff --git a/rng_state_24.pth b/rng_state_24.pth new file mode 100644 index 0000000000000000000000000000000000000000..9f6c0ec908287d45394091400896b43a9c98a09d --- /dev/null +++ b/rng_state_24.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:394d797f15aa4f9ec3015237395b72eaa56bac0dcb421265d19a7b99b09a6de7 +size 14262 diff --git a/rng_state_25.pth b/rng_state_25.pth new file mode 100644 index 0000000000000000000000000000000000000000..4620149cad3284e1b62fb55f2bc3c4e360011c8f --- /dev/null +++ b/rng_state_25.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:119ffa9d112b5b168bb79d1652fdf255715599cafe12a55f23c8fb1be4c07d65 +size 14262 diff --git a/rng_state_26.pth b/rng_state_26.pth new file mode 100644 index 0000000000000000000000000000000000000000..20bb8a883255756f2018d8ea04d8d84fd39bfa7f --- /dev/null +++ b/rng_state_26.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81ff8232c02a6494b5d83a60cf2417950d0fd12c01dc61bb78095a704d8fa855 +size 14262 diff --git a/rng_state_27.pth b/rng_state_27.pth new file mode 100644 index 0000000000000000000000000000000000000000..95e528330cc22490a0955d72cf9e3d3bb305409c --- /dev/null +++ b/rng_state_27.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2d47048e8424309a097fc6e3c177a9587017a79e037fe52b63948c50907af24 +size 14262 diff --git a/rng_state_28.pth b/rng_state_28.pth new file mode 100644 index 0000000000000000000000000000000000000000..78b947940ba6668d8da217b31d3d74583850f5ff --- /dev/null +++ b/rng_state_28.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81624ff172da08cda31ff267a304d790774cfa9f4790b8f87ba926d6f4461732 +size 14262 diff --git a/rng_state_29.pth b/rng_state_29.pth new file mode 100644 index 0000000000000000000000000000000000000000..407c7f767c79504292914568304ce52923732f08 --- /dev/null +++ b/rng_state_29.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e97dc6401f8d7a7d911aa560a94125deb4d7ff652e5d772121c0dce00b24a4cd +size 14262 diff --git a/rng_state_3.pth b/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..3dda0faa6f8a93045c31cf600b6f231515049ed5 --- /dev/null +++ b/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59f7ea91829ae1e2a0d69668197bb1dafaaf4f51a6a75bf51303eaf06b14af15 +size 14256 diff --git a/rng_state_30.pth b/rng_state_30.pth new file mode 100644 index 0000000000000000000000000000000000000000..b1be4c58b09d10f4eef7a15cb3786705c1c21b9c --- /dev/null +++ b/rng_state_30.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f13e89d28c6021c082a7bde39bf4f8a4a36c13b6db75552b421c128c58a5e3d +size 14262 diff --git a/rng_state_31.pth b/rng_state_31.pth new file mode 100644 index 0000000000000000000000000000000000000000..dd1a35fb3d3f743c35a54a6634b50e044d8ef0d5 --- /dev/null +++ b/rng_state_31.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e64d85325f4aca7febb35898eaea978a904e728fccdf7c03a6fe95ebfbc726a +size 14262 diff --git a/rng_state_4.pth b/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..dd3180ccabbfe2c67aff191dd5fe97d4cb0f4669 --- /dev/null +++ b/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a12e14469bad046d0f8f7448de6f3572f9853078b65a0d903a2c2107c34fd123 +size 14256 diff --git a/rng_state_5.pth b/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..c81aac21e02f799ae7b21ead6a84d1e7d278962d --- /dev/null +++ b/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b86f2b7807c09cfe91c8ac5da498030189578c3ad3888d8317dbf18d8ad4a52b +size 14256 diff --git a/rng_state_6.pth b/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..b15c8d2cf52de4634794f81c8245c1c00d4c4cd4 --- /dev/null +++ b/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7802d1aaa48a68bc4371e93bb38c37e1120234f5b6c0a6fa1183b4759cc8ac9 +size 14256 diff --git a/rng_state_7.pth b/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..0e6a50712304302f10619e502b706fdbeb686fc2 --- /dev/null +++ b/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:167e9c190a20c8ee971ce9c9f293677908415c9b3c55cea34de9db054a1f40f9 +size 14256 diff --git a/rng_state_8.pth b/rng_state_8.pth new file mode 100644 index 0000000000000000000000000000000000000000..bf42a7b0f1deddadd4d4d38409f382d38699e7a6 --- /dev/null +++ b/rng_state_8.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e16ddc8153bc4b53884699d1f6780ae6b33aa3c82b1dfa5ea5c77bf551da56a2 +size 14256 diff --git a/rng_state_9.pth b/rng_state_9.pth new file mode 100644 index 0000000000000000000000000000000000000000..d2195fb04385996c366094c6fe9ea78d9ff256b7 --- /dev/null +++ b/rng_state_9.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20d9138b3e3b1dcc348d63242ed874f11b1586463d7370bcce68b376672b0a46 +size 14256 diff --git a/scheduler.pt b/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..285118469819c351494c475e86bcd62485ae8ece --- /dev/null +++ b/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb677b69d5815e7e1f5f341ea75c4bd0f7789a7a85a36e672ade3f744edffa5f +size 1064 diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..32bbe60ca5e2145640c01d658ffab0e3d1febefe --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edc974ba58e57a36428c60b3eead2f2b8a4511b0dac441f037b5e7a36bca7a8f +size 15103512 diff --git a/zero_to_fp32.py b/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters)