Encrux commited on
Commit
417f4c0
·
verified ·
1 Parent(s): bc6b5d7

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. checkpoint-100000/config.json +89 -0
  2. checkpoint-100000/embodiment_id.json +31 -0
  3. checkpoint-100000/experiment_cfg/conf.yaml +298 -0
  4. checkpoint-100000/experiment_cfg/config.yaml +336 -0
  5. checkpoint-100000/experiment_cfg/dataset_statistics.json +0 -0
  6. checkpoint-100000/experiment_cfg/final_model_config.json +70 -0
  7. checkpoint-100000/experiment_cfg/final_processor_config.json +0 -0
  8. checkpoint-100000/model-00001-of-00002.safetensors +3 -0
  9. checkpoint-100000/model-00002-of-00002.safetensors +3 -0
  10. checkpoint-100000/model.safetensors.index.json +0 -0
  11. checkpoint-100000/processor_config.json +2816 -0
  12. checkpoint-100000/rng_state.pth +3 -0
  13. checkpoint-100000/scheduler.pt +3 -0
  14. checkpoint-100000/statistics.json +0 -0
  15. checkpoint-100000/trainer_state.json +0 -0
  16. checkpoint-100000/training_args.bin +3 -0
  17. checkpoint-100000/wandb_config.json +1 -0
  18. checkpoint-80000/config.json +89 -0
  19. checkpoint-80000/embodiment_id.json +31 -0
  20. checkpoint-80000/experiment_cfg/conf.yaml +298 -0
  21. checkpoint-80000/experiment_cfg/config.yaml +336 -0
  22. checkpoint-80000/experiment_cfg/dataset_statistics.json +0 -0
  23. checkpoint-80000/experiment_cfg/final_model_config.json +70 -0
  24. checkpoint-80000/experiment_cfg/final_processor_config.json +0 -0
  25. checkpoint-80000/model-00001-of-00002.safetensors +3 -0
  26. checkpoint-80000/model-00002-of-00002.safetensors +3 -0
  27. checkpoint-80000/model.safetensors.index.json +0 -0
  28. checkpoint-80000/processor_config.json +2816 -0
  29. checkpoint-80000/rng_state.pth +3 -0
  30. checkpoint-80000/scheduler.pt +3 -0
  31. checkpoint-80000/statistics.json +0 -0
  32. checkpoint-80000/trainer_state.json +0 -0
  33. checkpoint-80000/training_args.bin +3 -0
  34. checkpoint-80000/wandb_config.json +1 -0
  35. checkpoint-85000/config.json +89 -0
  36. checkpoint-85000/embodiment_id.json +31 -0
  37. checkpoint-85000/experiment_cfg/conf.yaml +298 -0
  38. checkpoint-85000/experiment_cfg/config.yaml +336 -0
  39. checkpoint-85000/experiment_cfg/dataset_statistics.json +0 -0
  40. checkpoint-85000/experiment_cfg/final_model_config.json +70 -0
  41. checkpoint-85000/experiment_cfg/final_processor_config.json +0 -0
  42. checkpoint-85000/model-00001-of-00002.safetensors +3 -0
  43. checkpoint-85000/model-00002-of-00002.safetensors +3 -0
  44. checkpoint-85000/model.safetensors.index.json +0 -0
  45. checkpoint-85000/processor_config.json +2816 -0
  46. checkpoint-85000/rng_state.pth +3 -0
  47. checkpoint-85000/scheduler.pt +3 -0
  48. checkpoint-85000/statistics.json +0 -0
  49. checkpoint-85000/trainer_state.json +0 -0
  50. checkpoint-85000/training_args.bin +3 -0
checkpoint-100000/config.json ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_horizon": 50,
3
+ "add_pos_embed": true,
4
+ "apply_sincos_state_encoding": true,
5
+ "architectures": [
6
+ "Gr00tN1d6"
7
+ ],
8
+ "attn_dropout": 0.2,
9
+ "attn_implementation": null,
10
+ "backbone_embedding_dim": 2048,
11
+ "backbone_model_type": "eagle",
12
+ "backbone_trainable_params_fp32": true,
13
+ "collator_overwrite_image_inputs": false,
14
+ "color_jitter_params": {
15
+ "brightness": 0.1,
16
+ "contrast": 0.1,
17
+ "hue": 0.1,
18
+ "saturation": 0.1
19
+ },
20
+ "crop_fraction": 0.95,
21
+ "diffusion_model_cfg": {
22
+ "attention_head_dim": 48,
23
+ "dropout": 0.2,
24
+ "final_dropout": true,
25
+ "interleave_self_attention": true,
26
+ "norm_type": "ada_norm",
27
+ "num_attention_heads": 32,
28
+ "num_layers": 32,
29
+ "output_dim": 1024,
30
+ "positional_embeddings": null
31
+ },
32
+ "eagle_collator": true,
33
+ "formalize_language": true,
34
+ "gemma_collator": false,
35
+ "hidden_size": 1024,
36
+ "image_crop_size": null,
37
+ "image_target_size": null,
38
+ "input_embedding_dim": 1536,
39
+ "load_bf16": true,
40
+ "max_action_dim": 128,
41
+ "max_num_embodiments": 32,
42
+ "max_seq_len": 1024,
43
+ "max_state_dim": 128,
44
+ "model_dtype": "bfloat16",
45
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
46
+ "model_type": "Gr00tN1d6",
47
+ "noise_beta_alpha": 1.5,
48
+ "noise_beta_beta": 1.0,
49
+ "noise_s": 0.999,
50
+ "num_inference_timesteps": 4,
51
+ "num_timestep_buckets": 1000,
52
+ "random_rotation_angle": null,
53
+ "reproject_vision": false,
54
+ "select_layer": 16,
55
+ "shortest_image_edge": 256,
56
+ "soft_prompt_lr_scale": 1.0,
57
+ "soft_prompt_num_tokens": 32,
58
+ "state_dropout_prob": 0.0,
59
+ "state_dropout_prob_per_embodiment": {
60
+ "cmr_versius": 1.0,
61
+ "hamlyn_dvrk_30hz": 1.0,
62
+ "jhu_imerse_dvrk": 1.0,
63
+ "jhu_imerse_dvrk_mono": 1.0,
64
+ "jhu_imerse_star_il": 1.0,
65
+ "jhu_lscr_dvrk_smarts": 1.0,
66
+ "obuda_dvrk": 1.0,
67
+ "rob_surgical_bitrack": 1.0,
68
+ "stanford_dvrk_real": 1.0,
69
+ "tud_tundra_ur5e": 1.0,
70
+ "turin_mitic_ex_vivo": 1.0,
71
+ "ucb_dvrk": 1.0,
72
+ "ucsd_dvrk": 1.0,
73
+ "ustc_torin_tuodao": 1.0
74
+ },
75
+ "torch_dtype": "bfloat16",
76
+ "transformers_version": "4.51.3",
77
+ "tune_diffusion_model": true,
78
+ "tune_llm": false,
79
+ "tune_projector": true,
80
+ "tune_top_llm_layers": 4,
81
+ "tune_visual": false,
82
+ "tune_vlln": true,
83
+ "use_albumentations_transforms": true,
84
+ "use_alternate_vl_dit": true,
85
+ "use_flash_attention": true,
86
+ "use_relative_action": true,
87
+ "use_soft_prompts": false,
88
+ "use_vlln": true
89
+ }
checkpoint-100000/embodiment_id.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "robocasa_panda_omron": 13,
3
+ "gr1": 20,
4
+ "behavior_r1_pro": 24,
5
+ "unitree_g1": 8,
6
+ "oxe_google": 0,
7
+ "oxe_widowx": 1,
8
+ "libero_panda": 2,
9
+ "new_embodiment": 10,
10
+ "jhu_imerse_dvrk": 3,
11
+ "cmr_versius": 4,
12
+ "ucb_dvrk": 5,
13
+ "sanoscience_sim": 6,
14
+ "tum_sonata_franka": 7,
15
+ "hamlyn_dvrk_15hz": 9,
16
+ "hamlyn_dvrk_30hz": 11,
17
+ "ustc_torin_tuodao": 12,
18
+ "ucsd_dvrk": 14,
19
+ "jhu_imerse_dvrk_mono": 15,
20
+ "rob_surgical_bitrack": 16,
21
+ "stanford_dvrk_real": 17,
22
+ "obuda_dvrk": 18,
23
+ "polyu_sim": 19,
24
+ "moon_maestro": 21,
25
+ "jhu_lscr_dvrk_miracle": 22,
26
+ "jhu_lscr_dvrk_smarts": 23,
27
+ "jhu_imerse_star_il": 27,
28
+ "tud_tundra_ur5e": 25,
29
+ "turin_mitic_ex_vivo": 26,
30
+ "oxe_droid": 29
31
+ }
checkpoint-100000/experiment_cfg/conf.yaml ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ load_config_path: null
2
+ model:
3
+ model_type: Gr00tN1d6
4
+ model_dtype: bfloat16
5
+ model_name: nvidia/Eagle-Block2A-2B-v2
6
+ backbone_model_type: eagle
7
+ model_revision: null
8
+ tune_top_llm_layers: 4
9
+ backbone_embedding_dim: 2048
10
+ tune_llm: false
11
+ tune_visual: false
12
+ select_layer: 16
13
+ reproject_vision: false
14
+ use_flash_attention: true
15
+ load_bf16: false
16
+ collator_overwrite_image_inputs: false
17
+ eagle_collator: true
18
+ backbone_trainable_params_fp32: true
19
+ image_crop_size: null
20
+ image_target_size: null
21
+ shortest_image_edge: 256
22
+ crop_fraction: 0.95
23
+ random_rotation_angle: null
24
+ color_jitter_params: null
25
+ use_albumentations_transforms: true
26
+ extra_augmentation_config: null
27
+ formalize_language: true
28
+ apply_sincos_state_encoding: false
29
+ use_relative_action: true
30
+ max_state_dim: 29
31
+ max_action_dim: 29
32
+ action_horizon: 50
33
+ hidden_size: 1024
34
+ input_embedding_dim: 1536
35
+ add_pos_embed: true
36
+ attn_dropout: 0.2
37
+ use_vlln: true
38
+ max_seq_len: 1024
39
+ use_alternate_vl_dit: true
40
+ attend_text_every_n_blocks: 2
41
+ diffusion_model_cfg:
42
+ positional_embeddings: null
43
+ num_layers: 32
44
+ num_attention_heads: 32
45
+ attention_head_dim: 48
46
+ norm_type: ada_norm
47
+ dropout: 0.2
48
+ final_dropout: true
49
+ output_dim: 1024
50
+ interleave_self_attention: true
51
+ num_inference_timesteps: 4
52
+ noise_beta_alpha: 1.5
53
+ noise_beta_beta: 1.0
54
+ noise_s: 0.999
55
+ num_timestep_buckets: 1000
56
+ tune_projector: true
57
+ tune_diffusion_model: true
58
+ tune_vlln: true
59
+ state_dropout_prob: 0.0
60
+ state_dropout_prob_per_embodiment: null
61
+ state_additive_noise_scale: 0.0
62
+ max_num_embodiments: 32
63
+ data:
64
+ datasets:
65
+ - dataset_paths:
66
+ - /hkfs/work/workspace/scratch/uenyr-thesis/data/peg_transfer_lerobot
67
+ embodiment_tag: jhu_imerse_dvrk_mono
68
+ mix_ratio: 1.0
69
+ dataset_type: physical_embodiment
70
+ val_dataset_path: null
71
+ exclude_splits: null
72
+ include_splits: null
73
+ modality_configs:
74
+ jhu_imerse_dvrk_mono:
75
+ video:
76
+ delta_indices:
77
+ - 0
78
+ modality_keys:
79
+ - endoscope_left
80
+ sin_cos_embedding_keys: null
81
+ mean_std_embedding_keys: null
82
+ min_max_embedding_keys: null
83
+ pass_through_keys: null
84
+ action_configs: null
85
+ state:
86
+ delta_indices:
87
+ - 0
88
+ modality_keys:
89
+ - psm1_pose
90
+ - psm1_gripper
91
+ - psm2_pose
92
+ - psm2_gripper
93
+ sin_cos_embedding_keys: null
94
+ mean_std_embedding_keys:
95
+ - psm1_pose
96
+ - psm1_gripper
97
+ - psm2_pose
98
+ - psm2_gripper
99
+ min_max_embedding_keys: null
100
+ pass_through_keys: null
101
+ action_configs: null
102
+ action:
103
+ delta_indices:
104
+ - 0
105
+ - 1
106
+ - 2
107
+ - 3
108
+ - 4
109
+ - 5
110
+ - 6
111
+ - 7
112
+ - 8
113
+ - 9
114
+ - 10
115
+ - 11
116
+ - 12
117
+ - 13
118
+ - 14
119
+ - 15
120
+ - 16
121
+ - 17
122
+ - 18
123
+ - 19
124
+ - 20
125
+ - 21
126
+ - 22
127
+ - 23
128
+ - 24
129
+ - 25
130
+ - 26
131
+ - 27
132
+ - 28
133
+ - 29
134
+ - 30
135
+ - 31
136
+ - 32
137
+ - 33
138
+ - 34
139
+ - 35
140
+ - 36
141
+ - 37
142
+ - 38
143
+ - 39
144
+ - 40
145
+ - 41
146
+ - 42
147
+ - 43
148
+ - 44
149
+ - 45
150
+ - 46
151
+ - 47
152
+ - 48
153
+ - 49
154
+ modality_keys:
155
+ - psm1_pose
156
+ - psm1_gripper
157
+ - psm2_pose
158
+ - psm2_gripper
159
+ sin_cos_embedding_keys: null
160
+ mean_std_embedding_keys: null
161
+ min_max_embedding_keys: null
162
+ pass_through_keys: null
163
+ action_configs:
164
+ - rep: REL_XYZ_ROT6D
165
+ type: EEF
166
+ format: XYZ_ROT6D
167
+ state_key: psm1_pose
168
+ input_rotation_format: quat
169
+ input_quat_order: xyzw
170
+ reference_rotation_format: quat
171
+ reference_quat_order: xyzw
172
+ translation_scaling_key: null
173
+ rotation_scaling_key: null
174
+ hold_through_clutch: false
175
+ normalization_type: temporal_meanstd
176
+ - rep: ABSOLUTE
177
+ type: NON_EEF
178
+ format: DEFAULT
179
+ state_key: null
180
+ input_rotation_format: quat
181
+ input_quat_order: xyzw
182
+ reference_rotation_format: rot6d
183
+ reference_quat_order: xyzw
184
+ translation_scaling_key: null
185
+ rotation_scaling_key: null
186
+ hold_through_clutch: false
187
+ normalization_type: temporal_meanstd
188
+ - rep: REL_XYZ_ROT6D
189
+ type: EEF
190
+ format: XYZ_ROT6D
191
+ state_key: psm2_pose
192
+ input_rotation_format: quat
193
+ input_quat_order: xyzw
194
+ reference_rotation_format: quat
195
+ reference_quat_order: xyzw
196
+ translation_scaling_key: null
197
+ rotation_scaling_key: null
198
+ hold_through_clutch: false
199
+ normalization_type: temporal_meanstd
200
+ - rep: ABSOLUTE
201
+ type: NON_EEF
202
+ format: DEFAULT
203
+ state_key: null
204
+ input_rotation_format: quat
205
+ input_quat_order: xyzw
206
+ reference_rotation_format: rot6d
207
+ reference_quat_order: xyzw
208
+ translation_scaling_key: null
209
+ rotation_scaling_key: null
210
+ hold_through_clutch: false
211
+ normalization_type: temporal_meanstd
212
+ language:
213
+ delta_indices:
214
+ - 0
215
+ modality_keys:
216
+ - annotation.human.task_description
217
+ sin_cos_embedding_keys: null
218
+ mean_std_embedding_keys: null
219
+ min_max_embedding_keys: null
220
+ pass_through_keys: null
221
+ action_configs: null
222
+ download_cache: false
223
+ shard_size: 1024
224
+ episode_sampling_rate: 0.1
225
+ num_shards_per_epoch: 100000
226
+ override_pretraining_statistics: true
227
+ mode: single_turn
228
+ random_chop: 0.0
229
+ mock_dataset_mode: false
230
+ shuffle: true
231
+ seed: 42
232
+ multiprocessing_context: fork
233
+ allow_padding: false
234
+ subsample_ratio: 1.0
235
+ image_crop_size:
236
+ - 244
237
+ - 244
238
+ image_target_size:
239
+ - 224
240
+ - 224
241
+ video_backend: torchcodec
242
+ training:
243
+ output_dir: /hkfs/work/workspace/scratch/uenyr-thesis/outputs/groot_finetune_v2
244
+ experiment_name: null
245
+ max_steps: 100000
246
+ global_batch_size: 8
247
+ batch_size: null
248
+ gradient_accumulation_steps: 1
249
+ learning_rate: 0.0001
250
+ lr_scheduler_type: cosine
251
+ weight_decay: 1.0e-05
252
+ warmup_ratio: 0.05
253
+ warmup_steps: 0
254
+ max_grad_norm: 1.0
255
+ optim: adamw_torch
256
+ start_from_checkpoint: /hkfs/work/workspace/scratch/uenyr-thesis/pretrained/GR00T-H
257
+ tf32: true
258
+ fp16: false
259
+ bf16: true
260
+ eval_bf16: true
261
+ logging_steps: 10
262
+ save_steps: 10000
263
+ save_total_limit: 5
264
+ save_vl_model: false
265
+ upload_checkpoints: false
266
+ upload_every: 1000
267
+ upload_last_n_checkpoints: 5
268
+ max_concurrent_uploads: 2
269
+ eval_strategy: 'no'
270
+ eval_steps: 500
271
+ eval_set_split_ratio: 0.1
272
+ eval_batch_size: 2
273
+ save_best_eval_metric_name: ''
274
+ save_best_eval_metric_greater_is_better: true
275
+ deepspeed_stage: 2
276
+ gradient_checkpointing: false
277
+ transformers_trust_remote_code: true
278
+ transformers_local_files_only: false
279
+ transformers_cache_dir: null
280
+ transformers_access_token: null
281
+ use_ddp: false
282
+ ddp_bucket_cap_mb: 100
283
+ num_gpus: 1
284
+ dataloader_num_workers: 4
285
+ remove_unused_columns: false
286
+ use_wandb: false
287
+ wandb_project: finetune-gr00t-n1d6
288
+ enable_profiling: false
289
+ max_retries: 3
290
+ assert_loss_less_than: null
291
+ add_rl_callback: false
292
+ enable_open_loop_eval: false
293
+ open_loop_eval_traj_ids:
294
+ - 0
295
+ open_loop_eval_steps_per_traj: 100
296
+ open_loop_eval_plot_indices: null
297
+ max_steps: 100000
298
+ save_steps: 10000
checkpoint-100000/experiment_cfg/config.yaml ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !!python/object:gr00t.configs.base_config.Config
2
+ data: !!python/object:gr00t.configs.data.data_config.DataConfig
3
+ allow_padding: false
4
+ datasets:
5
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
6
+ dataset_paths:
7
+ - /hkfs/work/workspace/scratch/uenyr-thesis/data/peg_transfer_lerobot
8
+ dataset_type: physical_embodiment
9
+ embodiment_tag: jhu_imerse_dvrk_mono
10
+ exclude_splits: null
11
+ include_splits: null
12
+ mix_ratio: 1.0
13
+ val_dataset_path: null
14
+ download_cache: false
15
+ episode_sampling_rate: 0.1
16
+ image_crop_size:
17
+ - 244
18
+ - 244
19
+ image_target_size:
20
+ - 224
21
+ - 224
22
+ mock_dataset_mode: false
23
+ modality_configs:
24
+ jhu_imerse_dvrk_mono:
25
+ action: !!python/object:gr00t.data.types.ModalityConfig
26
+ action_configs:
27
+ - !!python/object:gr00t.data.types.ActionConfig
28
+ format: &id001 !!python/object/apply:gr00t.data.types.ActionFormat
29
+ - xyz+rot6d
30
+ hold_through_clutch: false
31
+ input_quat_order: xyzw
32
+ input_rotation_format: quat
33
+ normalization_type: temporal_meanstd
34
+ reference_quat_order: xyzw
35
+ reference_rotation_format: quat
36
+ rep: &id002 !!python/object/apply:gr00t.data.types.ActionRepresentation
37
+ - rel_xyz_rot6d
38
+ rotation_scaling_key: null
39
+ state_key: psm1_pose
40
+ translation_scaling_key: null
41
+ type: &id003 !!python/object/apply:gr00t.data.types.ActionType
42
+ - eef
43
+ - !!python/object:gr00t.data.types.ActionConfig
44
+ format: &id004 !!python/object/apply:gr00t.data.types.ActionFormat
45
+ - default
46
+ hold_through_clutch: false
47
+ input_quat_order: xyzw
48
+ input_rotation_format: quat
49
+ normalization_type: temporal_meanstd
50
+ reference_quat_order: xyzw
51
+ reference_rotation_format: rot6d
52
+ rep: &id005 !!python/object/apply:gr00t.data.types.ActionRepresentation
53
+ - absolute
54
+ rotation_scaling_key: null
55
+ state_key: null
56
+ translation_scaling_key: null
57
+ type: &id006 !!python/object/apply:gr00t.data.types.ActionType
58
+ - non_eef
59
+ - !!python/object:gr00t.data.types.ActionConfig
60
+ format: *id001
61
+ hold_through_clutch: false
62
+ input_quat_order: xyzw
63
+ input_rotation_format: quat
64
+ normalization_type: temporal_meanstd
65
+ reference_quat_order: xyzw
66
+ reference_rotation_format: quat
67
+ rep: *id002
68
+ rotation_scaling_key: null
69
+ state_key: psm2_pose
70
+ translation_scaling_key: null
71
+ type: *id003
72
+ - !!python/object:gr00t.data.types.ActionConfig
73
+ format: *id004
74
+ hold_through_clutch: false
75
+ input_quat_order: xyzw
76
+ input_rotation_format: quat
77
+ normalization_type: temporal_meanstd
78
+ reference_quat_order: xyzw
79
+ reference_rotation_format: rot6d
80
+ rep: *id005
81
+ rotation_scaling_key: null
82
+ state_key: null
83
+ translation_scaling_key: null
84
+ type: *id006
85
+ delta_indices:
86
+ - 0
87
+ - 1
88
+ - 2
89
+ - 3
90
+ - 4
91
+ - 5
92
+ - 6
93
+ - 7
94
+ - 8
95
+ - 9
96
+ - 10
97
+ - 11
98
+ - 12
99
+ - 13
100
+ - 14
101
+ - 15
102
+ - 16
103
+ - 17
104
+ - 18
105
+ - 19
106
+ - 20
107
+ - 21
108
+ - 22
109
+ - 23
110
+ - 24
111
+ - 25
112
+ - 26
113
+ - 27
114
+ - 28
115
+ - 29
116
+ - 30
117
+ - 31
118
+ - 32
119
+ - 33
120
+ - 34
121
+ - 35
122
+ - 36
123
+ - 37
124
+ - 38
125
+ - 39
126
+ - 40
127
+ - 41
128
+ - 42
129
+ - 43
130
+ - 44
131
+ - 45
132
+ - 46
133
+ - 47
134
+ - 48
135
+ - 49
136
+ mean_std_embedding_keys: null
137
+ min_max_embedding_keys: null
138
+ modality_keys:
139
+ - psm1_pose
140
+ - psm1_gripper
141
+ - psm2_pose
142
+ - psm2_gripper
143
+ pass_through_keys: null
144
+ sin_cos_embedding_keys: null
145
+ language: !!python/object:gr00t.data.types.ModalityConfig
146
+ action_configs: null
147
+ delta_indices:
148
+ - 0
149
+ mean_std_embedding_keys: null
150
+ min_max_embedding_keys: null
151
+ modality_keys:
152
+ - annotation.human.task_description
153
+ pass_through_keys: null
154
+ sin_cos_embedding_keys: null
155
+ state: !!python/object:gr00t.data.types.ModalityConfig
156
+ action_configs: null
157
+ delta_indices:
158
+ - 0
159
+ mean_std_embedding_keys:
160
+ - psm1_pose
161
+ - psm1_gripper
162
+ - psm2_pose
163
+ - psm2_gripper
164
+ min_max_embedding_keys: null
165
+ modality_keys:
166
+ - psm1_pose
167
+ - psm1_gripper
168
+ - psm2_pose
169
+ - psm2_gripper
170
+ pass_through_keys: null
171
+ sin_cos_embedding_keys: null
172
+ video: !!python/object:gr00t.data.types.ModalityConfig
173
+ action_configs: null
174
+ delta_indices:
175
+ - 0
176
+ mean_std_embedding_keys: null
177
+ min_max_embedding_keys: null
178
+ modality_keys:
179
+ - endoscope_left
180
+ pass_through_keys: null
181
+ sin_cos_embedding_keys: null
182
+ mode: single_turn
183
+ multiprocessing_context: fork
184
+ num_shards_per_epoch: 100000
185
+ override_pretraining_statistics: true
186
+ random_chop: 0.0
187
+ seed: 42
188
+ shard_size: 1024
189
+ shuffle: true
190
+ subsample_ratio: 1.0
191
+ video_backend: torchcodec
192
+ load_config_path: null
193
+ model: !!python/object:gr00t.configs.model.gr00t_n1d6.Gr00tN1d6Config
194
+ _attn_implementation_autoset: false
195
+ _attn_implementation_internal: null
196
+ _commit_hash: null
197
+ _name_or_path: ''
198
+ add_cross_attention: false
199
+ architectures: null
200
+ backbone_model_type: eagle
201
+ backbone_trainable_params_fp32: true
202
+ bad_words_ids: null
203
+ begin_suppress_tokens: null
204
+ bos_token_id: null
205
+ chunk_size_feed_forward: 0
206
+ color_jitter_params: null
207
+ cross_attention_hidden_size: null
208
+ decoder_start_token_id: null
209
+ diffusion_model_cfg:
210
+ attention_head_dim: 48
211
+ dropout: 0.2
212
+ final_dropout: true
213
+ interleave_self_attention: true
214
+ norm_type: ada_norm
215
+ num_attention_heads: 32
216
+ num_layers: 32
217
+ output_dim: 1024
218
+ positional_embeddings: null
219
+ diversity_penalty: 0.0
220
+ do_sample: false
221
+ eagle_collator: true
222
+ early_stopping: false
223
+ encoder_no_repeat_ngram_size: 0
224
+ eos_token_id: null
225
+ exponential_decay_length_penalty: null
226
+ extra_augmentation_config: null
227
+ finetuning_task: null
228
+ forced_bos_token_id: null
229
+ forced_eos_token_id: null
230
+ id2label:
231
+ 0: LABEL_0
232
+ 1: LABEL_1
233
+ is_decoder: false
234
+ is_encoder_decoder: false
235
+ label2id:
236
+ LABEL_0: 0
237
+ LABEL_1: 1
238
+ length_penalty: 1.0
239
+ load_bf16: false
240
+ max_length: 20
241
+ min_length: 0
242
+ model_name: nvidia/Eagle-Block2A-2B-v2
243
+ no_repeat_ngram_size: 0
244
+ num_beam_groups: 1
245
+ num_beams: 1
246
+ num_return_sequences: 1
247
+ output_attentions: false
248
+ output_hidden_states: false
249
+ output_scores: false
250
+ pad_token_id: null
251
+ prefix: null
252
+ problem_type: null
253
+ pruned_heads: {}
254
+ random_rotation_angle: null
255
+ remove_invalid_values: false
256
+ repetition_penalty: 1.0
257
+ reproject_vision: false
258
+ return_dict: true
259
+ return_dict_in_generate: false
260
+ sep_token_id: null
261
+ state_dropout_prob: 0.0
262
+ state_dropout_prob_per_embodiment: null
263
+ suppress_tokens: null
264
+ task_specific_params: null
265
+ temperature: 1.0
266
+ tf_legacy_loss: false
267
+ tie_encoder_decoder: false
268
+ tie_word_embeddings: true
269
+ tokenizer_class: null
270
+ top_k: 50
271
+ top_p: 1.0
272
+ torch_dtype: null
273
+ torchscript: false
274
+ transformers_version: null
275
+ tune_diffusion_model: true
276
+ tune_llm: false
277
+ tune_projector: true
278
+ tune_visual: false
279
+ typical_p: 1.0
280
+ use_bfloat16: false
281
+ use_relative_action: true
282
+ training: !!python/object:gr00t.configs.training.training_config.TrainingConfig
283
+ add_rl_callback: false
284
+ assert_loss_less_than: null
285
+ batch_size: null
286
+ bf16: true
287
+ dataloader_num_workers: 4
288
+ ddp_bucket_cap_mb: 100
289
+ deepspeed_stage: 2
290
+ enable_open_loop_eval: false
291
+ enable_profiling: false
292
+ eval_batch_size: 2
293
+ eval_bf16: true
294
+ eval_set_split_ratio: 0.1
295
+ eval_steps: 500
296
+ eval_strategy: 'no'
297
+ experiment_name: null
298
+ fp16: false
299
+ global_batch_size: 8
300
+ gradient_accumulation_steps: 1
301
+ gradient_checkpointing: false
302
+ learning_rate: 0.0001
303
+ logging_steps: 10
304
+ lr_scheduler_type: cosine
305
+ max_concurrent_uploads: 2
306
+ max_grad_norm: 1.0
307
+ max_retries: 3
308
+ max_steps: 100000
309
+ num_gpus: 1
310
+ open_loop_eval_plot_indices: null
311
+ open_loop_eval_steps_per_traj: 100
312
+ open_loop_eval_traj_ids:
313
+ - 0
314
+ optim: adamw_torch
315
+ output_dir: /hkfs/work/workspace/scratch/uenyr-thesis/outputs/groot_finetune_v2
316
+ remove_unused_columns: false
317
+ save_best_eval_metric_greater_is_better: true
318
+ save_best_eval_metric_name: ''
319
+ save_steps: 10000
320
+ save_total_limit: 5
321
+ save_vl_model: false
322
+ start_from_checkpoint: /hkfs/work/workspace/scratch/uenyr-thesis/pretrained/GR00T-H
323
+ tf32: true
324
+ transformers_access_token: null
325
+ transformers_cache_dir: null
326
+ transformers_local_files_only: false
327
+ transformers_trust_remote_code: true
328
+ upload_checkpoints: false
329
+ upload_every: 1000
330
+ upload_last_n_checkpoints: 5
331
+ use_ddp: false
332
+ use_wandb: false
333
+ wandb_project: finetune-gr00t-n1d6
334
+ warmup_ratio: 0.05
335
+ warmup_steps: 0
336
+ weight_decay: 1.0e-05
checkpoint-100000/experiment_cfg/dataset_statistics.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-100000/experiment_cfg/final_model_config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "Gr00tN1d6",
3
+ "model_dtype": "bfloat16",
4
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
5
+ "backbone_model_type": "eagle",
6
+ "model_revision": null,
7
+ "tune_top_llm_layers": 4,
8
+ "backbone_embedding_dim": 2048,
9
+ "tune_llm": false,
10
+ "tune_visual": false,
11
+ "select_layer": 16,
12
+ "reproject_vision": false,
13
+ "use_flash_attention": true,
14
+ "load_bf16": true,
15
+ "collator_overwrite_image_inputs": false,
16
+ "eagle_collator": true,
17
+ "backbone_trainable_params_fp32": true,
18
+ "extra_augmentation_config": null,
19
+ "apply_sincos_state_encoding": true,
20
+ "use_relative_action": true,
21
+ "max_state_dim": 128,
22
+ "max_action_dim": 128,
23
+ "action_horizon": 50,
24
+ "hidden_size": 1024,
25
+ "input_embedding_dim": 1536,
26
+ "add_pos_embed": true,
27
+ "attn_dropout": 0.2,
28
+ "use_vlln": true,
29
+ "max_seq_len": 1024,
30
+ "use_alternate_vl_dit": true,
31
+ "attend_text_every_n_blocks": 2,
32
+ "diffusion_model_cfg": {
33
+ "attention_head_dim": 48,
34
+ "dropout": 0.2,
35
+ "final_dropout": true,
36
+ "interleave_self_attention": true,
37
+ "norm_type": "ada_norm",
38
+ "num_attention_heads": 32,
39
+ "num_layers": 32,
40
+ "output_dim": 1024,
41
+ "positional_embeddings": null
42
+ },
43
+ "num_inference_timesteps": 4,
44
+ "noise_beta_alpha": 1.5,
45
+ "noise_beta_beta": 1.0,
46
+ "noise_s": 0.999,
47
+ "num_timestep_buckets": 1000,
48
+ "tune_projector": true,
49
+ "tune_diffusion_model": true,
50
+ "tune_vlln": true,
51
+ "state_dropout_prob": 0.0,
52
+ "state_dropout_prob_per_embodiment": {
53
+ "cmr_versius": 1.0,
54
+ "jhu_imerse_dvrk": 1.0,
55
+ "obuda_dvrk": 1.0,
56
+ "stanford_dvrk_real": 1.0,
57
+ "ucb_dvrk": 1.0,
58
+ "ucsd_dvrk": 1.0,
59
+ "hamlyn_dvrk_30hz": 1.0,
60
+ "jhu_imerse_dvrk_mono": 1.0,
61
+ "jhu_imerse_star_il": 1.0,
62
+ "jhu_lscr_dvrk_smarts": 1.0,
63
+ "rob_surgical_bitrack": 1.0,
64
+ "tud_tundra_ur5e": 1.0,
65
+ "turin_mitic_ex_vivo": 1.0,
66
+ "ustc_torin_tuodao": 1.0
67
+ },
68
+ "state_additive_noise_scale": 0.0,
69
+ "max_num_embodiments": 32
70
+ }
checkpoint-100000/experiment_cfg/final_processor_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-100000/model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d585cc53c414d233c2cf7c2d75c661f6dfcfba24408fb56b9616be80d0b78725
3
+ size 4990126640
checkpoint-100000/model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebffb80574459027bd073764b24cc6599ce44a51e553fae0e3f8fdc1b3c83aed
3
+ size 4823190320
checkpoint-100000/model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-100000/processor_config.json ADDED
@@ -0,0 +1,2816 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "Gr00tN1d6Processor",
3
+ "processor_kwargs": {
4
+ "modality_configs": {
5
+ "behavior_r1_pro": {
6
+ "video": {
7
+ "delta_indices": [
8
+ 0
9
+ ],
10
+ "modality_keys": [
11
+ "observation.images.rgb.head_256_256",
12
+ "observation.images.rgb.left_wrist_256_256",
13
+ "observation.images.rgb.right_wrist_256_256"
14
+ ],
15
+ "sin_cos_embedding_keys": null,
16
+ "mean_std_embedding_keys": null,
17
+ "min_max_embedding_keys": null,
18
+ "pass_through_keys": null,
19
+ "action_configs": null
20
+ },
21
+ "state": {
22
+ "delta_indices": [
23
+ 0
24
+ ],
25
+ "modality_keys": [
26
+ "robot_pos",
27
+ "robot_ori_cos",
28
+ "robot_ori_sin",
29
+ "robot_2d_ori",
30
+ "robot_2d_ori_cos",
31
+ "robot_2d_ori_sin",
32
+ "robot_lin_vel",
33
+ "robot_ang_vel",
34
+ "arm_left_qpos",
35
+ "arm_left_qpos_sin",
36
+ "arm_left_qpos_cos",
37
+ "eef_left_pos",
38
+ "eef_left_quat",
39
+ "gripper_left_qpos",
40
+ "arm_right_qpos",
41
+ "arm_right_qpos_sin",
42
+ "arm_right_qpos_cos",
43
+ "eef_right_pos",
44
+ "eef_right_quat",
45
+ "gripper_right_qpos",
46
+ "trunk_qpos"
47
+ ],
48
+ "sin_cos_embedding_keys": null,
49
+ "mean_std_embedding_keys": null,
50
+ "min_max_embedding_keys": null,
51
+ "pass_through_keys": null,
52
+ "action_configs": null
53
+ },
54
+ "action": {
55
+ "delta_indices": [
56
+ 0,
57
+ 1,
58
+ 2,
59
+ 3,
60
+ 4,
61
+ 5,
62
+ 6,
63
+ 7,
64
+ 8,
65
+ 9,
66
+ 10,
67
+ 11,
68
+ 12,
69
+ 13,
70
+ 14,
71
+ 15,
72
+ 16,
73
+ 17,
74
+ 18,
75
+ 19,
76
+ 20,
77
+ 21,
78
+ 22,
79
+ 23,
80
+ 24,
81
+ 25,
82
+ 26,
83
+ 27,
84
+ 28,
85
+ 29,
86
+ 30,
87
+ 31
88
+ ],
89
+ "modality_keys": [
90
+ "base",
91
+ "torso",
92
+ "left_arm",
93
+ "left_gripper",
94
+ "right_arm",
95
+ "right_gripper"
96
+ ],
97
+ "sin_cos_embedding_keys": null,
98
+ "mean_std_embedding_keys": null,
99
+ "min_max_embedding_keys": null,
100
+ "pass_through_keys": null,
101
+ "action_configs": [
102
+ {
103
+ "rep": "ABSOLUTE",
104
+ "type": "NON_EEF",
105
+ "format": "DEFAULT",
106
+ "state_key": null,
107
+ "input_rotation_format": "quat",
108
+ "input_quat_order": "xyzw",
109
+ "reference_rotation_format": "rot6d",
110
+ "reference_quat_order": "xyzw",
111
+ "translation_scaling_key": null,
112
+ "rotation_scaling_key": null,
113
+ "hold_through_clutch": false,
114
+ "normalization_type": "percentile"
115
+ },
116
+ {
117
+ "rep": "RELATIVE",
118
+ "type": "NON_EEF",
119
+ "format": "DEFAULT",
120
+ "state_key": "trunk_qpos",
121
+ "input_rotation_format": "quat",
122
+ "input_quat_order": "xyzw",
123
+ "reference_rotation_format": "rot6d",
124
+ "reference_quat_order": "xyzw",
125
+ "translation_scaling_key": null,
126
+ "rotation_scaling_key": null,
127
+ "hold_through_clutch": false,
128
+ "normalization_type": "percentile"
129
+ },
130
+ {
131
+ "rep": "RELATIVE",
132
+ "type": "NON_EEF",
133
+ "format": "DEFAULT",
134
+ "state_key": "arm_left_qpos",
135
+ "input_rotation_format": "quat",
136
+ "input_quat_order": "xyzw",
137
+ "reference_rotation_format": "rot6d",
138
+ "reference_quat_order": "xyzw",
139
+ "translation_scaling_key": null,
140
+ "rotation_scaling_key": null,
141
+ "hold_through_clutch": false,
142
+ "normalization_type": "percentile"
143
+ },
144
+ {
145
+ "rep": "ABSOLUTE",
146
+ "type": "NON_EEF",
147
+ "format": "DEFAULT",
148
+ "state_key": null,
149
+ "input_rotation_format": "quat",
150
+ "input_quat_order": "xyzw",
151
+ "reference_rotation_format": "rot6d",
152
+ "reference_quat_order": "xyzw",
153
+ "translation_scaling_key": null,
154
+ "rotation_scaling_key": null,
155
+ "hold_through_clutch": false,
156
+ "normalization_type": "percentile"
157
+ },
158
+ {
159
+ "rep": "RELATIVE",
160
+ "type": "NON_EEF",
161
+ "format": "DEFAULT",
162
+ "state_key": "arm_right_qpos",
163
+ "input_rotation_format": "quat",
164
+ "input_quat_order": "xyzw",
165
+ "reference_rotation_format": "rot6d",
166
+ "reference_quat_order": "xyzw",
167
+ "translation_scaling_key": null,
168
+ "rotation_scaling_key": null,
169
+ "hold_through_clutch": false,
170
+ "normalization_type": "percentile"
171
+ },
172
+ {
173
+ "rep": "ABSOLUTE",
174
+ "type": "NON_EEF",
175
+ "format": "DEFAULT",
176
+ "state_key": null,
177
+ "input_rotation_format": "quat",
178
+ "input_quat_order": "xyzw",
179
+ "reference_rotation_format": "rot6d",
180
+ "reference_quat_order": "xyzw",
181
+ "translation_scaling_key": null,
182
+ "rotation_scaling_key": null,
183
+ "hold_through_clutch": false,
184
+ "normalization_type": "percentile"
185
+ }
186
+ ]
187
+ },
188
+ "language": {
189
+ "delta_indices": [
190
+ 0
191
+ ],
192
+ "modality_keys": [
193
+ "annotation.human.coarse_action"
194
+ ],
195
+ "sin_cos_embedding_keys": null,
196
+ "mean_std_embedding_keys": null,
197
+ "min_max_embedding_keys": null,
198
+ "pass_through_keys": null,
199
+ "action_configs": null
200
+ }
201
+ },
202
+ "gr1": {
203
+ "video": {
204
+ "delta_indices": [
205
+ 0
206
+ ],
207
+ "modality_keys": [
208
+ "ego_view_bg_crop_pad_res256_freq20"
209
+ ],
210
+ "sin_cos_embedding_keys": null,
211
+ "mean_std_embedding_keys": null,
212
+ "min_max_embedding_keys": null,
213
+ "pass_through_keys": null,
214
+ "action_configs": null
215
+ },
216
+ "state": {
217
+ "delta_indices": [
218
+ 0
219
+ ],
220
+ "modality_keys": [
221
+ "left_arm",
222
+ "right_arm",
223
+ "left_hand",
224
+ "right_hand",
225
+ "waist"
226
+ ],
227
+ "sin_cos_embedding_keys": [
228
+ "left_arm",
229
+ "right_arm",
230
+ "left_hand",
231
+ "right_hand",
232
+ "waist"
233
+ ],
234
+ "mean_std_embedding_keys": null,
235
+ "min_max_embedding_keys": null,
236
+ "pass_through_keys": null,
237
+ "action_configs": null
238
+ },
239
+ "action": {
240
+ "delta_indices": [
241
+ 0,
242
+ 1,
243
+ 2,
244
+ 3,
245
+ 4,
246
+ 5,
247
+ 6,
248
+ 7,
249
+ 8,
250
+ 9,
251
+ 10,
252
+ 11,
253
+ 12,
254
+ 13,
255
+ 14,
256
+ 15
257
+ ],
258
+ "modality_keys": [
259
+ "left_arm",
260
+ "right_arm",
261
+ "left_hand",
262
+ "right_hand",
263
+ "waist"
264
+ ],
265
+ "sin_cos_embedding_keys": null,
266
+ "mean_std_embedding_keys": null,
267
+ "min_max_embedding_keys": null,
268
+ "pass_through_keys": null,
269
+ "action_configs": [
270
+ {
271
+ "rep": "RELATIVE",
272
+ "type": "NON_EEF",
273
+ "format": "DEFAULT",
274
+ "state_key": null,
275
+ "input_rotation_format": "quat",
276
+ "input_quat_order": "xyzw",
277
+ "reference_rotation_format": "rot6d",
278
+ "reference_quat_order": "xyzw",
279
+ "translation_scaling_key": null,
280
+ "rotation_scaling_key": null,
281
+ "hold_through_clutch": false,
282
+ "normalization_type": "percentile"
283
+ },
284
+ {
285
+ "rep": "RELATIVE",
286
+ "type": "NON_EEF",
287
+ "format": "DEFAULT",
288
+ "state_key": null,
289
+ "input_rotation_format": "quat",
290
+ "input_quat_order": "xyzw",
291
+ "reference_rotation_format": "rot6d",
292
+ "reference_quat_order": "xyzw",
293
+ "translation_scaling_key": null,
294
+ "rotation_scaling_key": null,
295
+ "hold_through_clutch": false,
296
+ "normalization_type": "percentile"
297
+ },
298
+ {
299
+ "rep": "RELATIVE",
300
+ "type": "NON_EEF",
301
+ "format": "DEFAULT",
302
+ "state_key": null,
303
+ "input_rotation_format": "quat",
304
+ "input_quat_order": "xyzw",
305
+ "reference_rotation_format": "rot6d",
306
+ "reference_quat_order": "xyzw",
307
+ "translation_scaling_key": null,
308
+ "rotation_scaling_key": null,
309
+ "hold_through_clutch": false,
310
+ "normalization_type": "percentile"
311
+ },
312
+ {
313
+ "rep": "RELATIVE",
314
+ "type": "NON_EEF",
315
+ "format": "DEFAULT",
316
+ "state_key": null,
317
+ "input_rotation_format": "quat",
318
+ "input_quat_order": "xyzw",
319
+ "reference_rotation_format": "rot6d",
320
+ "reference_quat_order": "xyzw",
321
+ "translation_scaling_key": null,
322
+ "rotation_scaling_key": null,
323
+ "hold_through_clutch": false,
324
+ "normalization_type": "percentile"
325
+ },
326
+ {
327
+ "rep": "ABSOLUTE",
328
+ "type": "NON_EEF",
329
+ "format": "DEFAULT",
330
+ "state_key": null,
331
+ "input_rotation_format": "quat",
332
+ "input_quat_order": "xyzw",
333
+ "reference_rotation_format": "rot6d",
334
+ "reference_quat_order": "xyzw",
335
+ "translation_scaling_key": null,
336
+ "rotation_scaling_key": null,
337
+ "hold_through_clutch": false,
338
+ "normalization_type": "percentile"
339
+ }
340
+ ]
341
+ },
342
+ "language": {
343
+ "delta_indices": [
344
+ 0
345
+ ],
346
+ "modality_keys": [
347
+ "task"
348
+ ],
349
+ "sin_cos_embedding_keys": null,
350
+ "mean_std_embedding_keys": null,
351
+ "min_max_embedding_keys": null,
352
+ "pass_through_keys": null,
353
+ "action_configs": null
354
+ }
355
+ },
356
+ "robocasa_panda_omron": {
357
+ "video": {
358
+ "delta_indices": [
359
+ 0
360
+ ],
361
+ "modality_keys": [
362
+ "res256_image_side_0",
363
+ "res256_image_side_1",
364
+ "res256_image_wrist_0"
365
+ ],
366
+ "sin_cos_embedding_keys": null,
367
+ "mean_std_embedding_keys": null,
368
+ "min_max_embedding_keys": null,
369
+ "pass_through_keys": null,
370
+ "action_configs": null
371
+ },
372
+ "state": {
373
+ "delta_indices": [
374
+ 0
375
+ ],
376
+ "modality_keys": [
377
+ "end_effector_position_relative",
378
+ "end_effector_rotation_relative",
379
+ "gripper_qpos",
380
+ "base_position",
381
+ "base_rotation"
382
+ ],
383
+ "sin_cos_embedding_keys": null,
384
+ "mean_std_embedding_keys": null,
385
+ "min_max_embedding_keys": null,
386
+ "pass_through_keys": null,
387
+ "action_configs": null
388
+ },
389
+ "action": {
390
+ "delta_indices": [
391
+ 0,
392
+ 1,
393
+ 2,
394
+ 3,
395
+ 4,
396
+ 5,
397
+ 6,
398
+ 7,
399
+ 8,
400
+ 9,
401
+ 10,
402
+ 11,
403
+ 12,
404
+ 13,
405
+ 14,
406
+ 15
407
+ ],
408
+ "modality_keys": [
409
+ "end_effector_position",
410
+ "end_effector_rotation",
411
+ "gripper_close",
412
+ "base_motion",
413
+ "control_mode"
414
+ ],
415
+ "sin_cos_embedding_keys": null,
416
+ "mean_std_embedding_keys": null,
417
+ "min_max_embedding_keys": null,
418
+ "pass_through_keys": null,
419
+ "action_configs": [
420
+ {
421
+ "rep": "ABSOLUTE",
422
+ "type": "NON_EEF",
423
+ "format": "DEFAULT",
424
+ "state_key": null,
425
+ "input_rotation_format": "quat",
426
+ "input_quat_order": "xyzw",
427
+ "reference_rotation_format": "rot6d",
428
+ "reference_quat_order": "xyzw",
429
+ "translation_scaling_key": null,
430
+ "rotation_scaling_key": null,
431
+ "hold_through_clutch": false,
432
+ "normalization_type": "percentile"
433
+ },
434
+ {
435
+ "rep": "ABSOLUTE",
436
+ "type": "NON_EEF",
437
+ "format": "DEFAULT",
438
+ "state_key": null,
439
+ "input_rotation_format": "quat",
440
+ "input_quat_order": "xyzw",
441
+ "reference_rotation_format": "rot6d",
442
+ "reference_quat_order": "xyzw",
443
+ "translation_scaling_key": null,
444
+ "rotation_scaling_key": null,
445
+ "hold_through_clutch": false,
446
+ "normalization_type": "percentile"
447
+ },
448
+ {
449
+ "rep": "ABSOLUTE",
450
+ "type": "NON_EEF",
451
+ "format": "DEFAULT",
452
+ "state_key": null,
453
+ "input_rotation_format": "quat",
454
+ "input_quat_order": "xyzw",
455
+ "reference_rotation_format": "rot6d",
456
+ "reference_quat_order": "xyzw",
457
+ "translation_scaling_key": null,
458
+ "rotation_scaling_key": null,
459
+ "hold_through_clutch": false,
460
+ "normalization_type": "percentile"
461
+ },
462
+ {
463
+ "rep": "ABSOLUTE",
464
+ "type": "NON_EEF",
465
+ "format": "DEFAULT",
466
+ "state_key": null,
467
+ "input_rotation_format": "quat",
468
+ "input_quat_order": "xyzw",
469
+ "reference_rotation_format": "rot6d",
470
+ "reference_quat_order": "xyzw",
471
+ "translation_scaling_key": null,
472
+ "rotation_scaling_key": null,
473
+ "hold_through_clutch": false,
474
+ "normalization_type": "percentile"
475
+ },
476
+ {
477
+ "rep": "ABSOLUTE",
478
+ "type": "NON_EEF",
479
+ "format": "DEFAULT",
480
+ "state_key": null,
481
+ "input_rotation_format": "quat",
482
+ "input_quat_order": "xyzw",
483
+ "reference_rotation_format": "rot6d",
484
+ "reference_quat_order": "xyzw",
485
+ "translation_scaling_key": null,
486
+ "rotation_scaling_key": null,
487
+ "hold_through_clutch": false,
488
+ "normalization_type": "percentile"
489
+ }
490
+ ]
491
+ },
492
+ "language": {
493
+ "delta_indices": [
494
+ 0
495
+ ],
496
+ "modality_keys": [
497
+ "annotation.human.action.task_description"
498
+ ],
499
+ "sin_cos_embedding_keys": null,
500
+ "mean_std_embedding_keys": null,
501
+ "min_max_embedding_keys": null,
502
+ "pass_through_keys": null,
503
+ "action_configs": null
504
+ }
505
+ },
506
+ "cmr_versius": {
507
+ "video": {
508
+ "delta_indices": [
509
+ 0
510
+ ],
511
+ "modality_keys": [
512
+ "endoscope"
513
+ ],
514
+ "sin_cos_embedding_keys": null,
515
+ "mean_std_embedding_keys": null,
516
+ "min_max_embedding_keys": null,
517
+ "pass_through_keys": null,
518
+ "action_configs": null
519
+ },
520
+ "state": {
521
+ "delta_indices": [
522
+ 0
523
+ ],
524
+ "modality_keys": [
525
+ "left_pose",
526
+ "left_gripper",
527
+ "right_pose",
528
+ "right_gripper",
529
+ "translation_scaling",
530
+ "rotation_scaling",
531
+ "hapticengaged_left",
532
+ "hapticengaged_right"
533
+ ],
534
+ "sin_cos_embedding_keys": null,
535
+ "mean_std_embedding_keys": [
536
+ "left_pose",
537
+ "left_gripper",
538
+ "right_pose",
539
+ "right_gripper"
540
+ ],
541
+ "min_max_embedding_keys": null,
542
+ "pass_through_keys": [
543
+ "translation_scaling",
544
+ "rotation_scaling",
545
+ "hapticengaged_left",
546
+ "hapticengaged_right"
547
+ ],
548
+ "action_configs": null
549
+ },
550
+ "action": {
551
+ "delta_indices": [
552
+ 2,
553
+ 4,
554
+ 6,
555
+ 8,
556
+ 10,
557
+ 12,
558
+ 14,
559
+ 16,
560
+ 18,
561
+ 20,
562
+ 22,
563
+ 24,
564
+ 26,
565
+ 28,
566
+ 30,
567
+ 32,
568
+ 34,
569
+ 36,
570
+ 38,
571
+ 40,
572
+ 42,
573
+ 44,
574
+ 46,
575
+ 48,
576
+ 50,
577
+ 52,
578
+ 54,
579
+ 56,
580
+ 58,
581
+ 60,
582
+ 62,
583
+ 64,
584
+ 66,
585
+ 68,
586
+ 70,
587
+ 72,
588
+ 74,
589
+ 76,
590
+ 78,
591
+ 80,
592
+ 82,
593
+ 84,
594
+ 86,
595
+ 88,
596
+ 90,
597
+ 92,
598
+ 94,
599
+ 96,
600
+ 98,
601
+ 100
602
+ ],
603
+ "modality_keys": [
604
+ "left_pose",
605
+ "left_gripper",
606
+ "right_pose",
607
+ "right_gripper",
608
+ "hapticengaged_left",
609
+ "hapticengaged_right"
610
+ ],
611
+ "sin_cos_embedding_keys": null,
612
+ "mean_std_embedding_keys": null,
613
+ "min_max_embedding_keys": null,
614
+ "pass_through_keys": [
615
+ "hapticengaged_left",
616
+ "hapticengaged_right"
617
+ ],
618
+ "action_configs": [
619
+ {
620
+ "rep": "REL_XYZ_ROT6D",
621
+ "type": "EEF",
622
+ "format": "XYZ_ROT6D",
623
+ "state_key": "left_pose",
624
+ "input_rotation_format": "quat",
625
+ "input_quat_order": "xyzw",
626
+ "reference_rotation_format": "quat",
627
+ "reference_quat_order": "xyzw",
628
+ "translation_scaling_key": "translation_scaling",
629
+ "rotation_scaling_key": "rotation_scaling",
630
+ "hold_through_clutch": false,
631
+ "normalization_type": "percentile"
632
+ },
633
+ {
634
+ "rep": "ABSOLUTE",
635
+ "type": "NON_EEF",
636
+ "format": "DEFAULT",
637
+ "state_key": "left_gripper",
638
+ "input_rotation_format": "quat",
639
+ "input_quat_order": "xyzw",
640
+ "reference_rotation_format": "rot6d",
641
+ "reference_quat_order": "xyzw",
642
+ "translation_scaling_key": null,
643
+ "rotation_scaling_key": null,
644
+ "hold_through_clutch": true,
645
+ "normalization_type": "percentile"
646
+ },
647
+ {
648
+ "rep": "REL_XYZ_ROT6D",
649
+ "type": "EEF",
650
+ "format": "XYZ_ROT6D",
651
+ "state_key": "right_pose",
652
+ "input_rotation_format": "quat",
653
+ "input_quat_order": "xyzw",
654
+ "reference_rotation_format": "quat",
655
+ "reference_quat_order": "xyzw",
656
+ "translation_scaling_key": "translation_scaling",
657
+ "rotation_scaling_key": "rotation_scaling",
658
+ "hold_through_clutch": false,
659
+ "normalization_type": "percentile"
660
+ },
661
+ {
662
+ "rep": "ABSOLUTE",
663
+ "type": "NON_EEF",
664
+ "format": "DEFAULT",
665
+ "state_key": "right_gripper",
666
+ "input_rotation_format": "quat",
667
+ "input_quat_order": "xyzw",
668
+ "reference_rotation_format": "rot6d",
669
+ "reference_quat_order": "xyzw",
670
+ "translation_scaling_key": null,
671
+ "rotation_scaling_key": null,
672
+ "hold_through_clutch": true,
673
+ "normalization_type": "percentile"
674
+ },
675
+ {
676
+ "rep": "ABSOLUTE",
677
+ "type": "NON_EEF",
678
+ "format": "DEFAULT",
679
+ "state_key": null,
680
+ "input_rotation_format": "quat",
681
+ "input_quat_order": "xyzw",
682
+ "reference_rotation_format": "rot6d",
683
+ "reference_quat_order": "xyzw",
684
+ "translation_scaling_key": null,
685
+ "rotation_scaling_key": null,
686
+ "hold_through_clutch": false,
687
+ "normalization_type": "skip"
688
+ },
689
+ {
690
+ "rep": "ABSOLUTE",
691
+ "type": "NON_EEF",
692
+ "format": "DEFAULT",
693
+ "state_key": null,
694
+ "input_rotation_format": "quat",
695
+ "input_quat_order": "xyzw",
696
+ "reference_rotation_format": "rot6d",
697
+ "reference_quat_order": "xyzw",
698
+ "translation_scaling_key": null,
699
+ "rotation_scaling_key": null,
700
+ "hold_through_clutch": false,
701
+ "normalization_type": "skip"
702
+ }
703
+ ]
704
+ },
705
+ "language": {
706
+ "delta_indices": [
707
+ 0
708
+ ],
709
+ "modality_keys": [
710
+ "annotation.human.task_description"
711
+ ],
712
+ "sin_cos_embedding_keys": null,
713
+ "mean_std_embedding_keys": null,
714
+ "min_max_embedding_keys": null,
715
+ "pass_through_keys": null,
716
+ "action_configs": null
717
+ }
718
+ },
719
+ "ucsd_dvrk": {
720
+ "video": {
721
+ "delta_indices": [
722
+ 0
723
+ ],
724
+ "modality_keys": [
725
+ "camera_left"
726
+ ],
727
+ "sin_cos_embedding_keys": null,
728
+ "mean_std_embedding_keys": null,
729
+ "min_max_embedding_keys": null,
730
+ "pass_through_keys": null,
731
+ "action_configs": null
732
+ },
733
+ "state": {
734
+ "delta_indices": [
735
+ 0
736
+ ],
737
+ "modality_keys": [
738
+ "psm_retraction_pose",
739
+ "psm_retraction_gripper",
740
+ "psm_cutter_pose",
741
+ "psm_cutter_gripper"
742
+ ],
743
+ "sin_cos_embedding_keys": null,
744
+ "mean_std_embedding_keys": [
745
+ "psm_retraction_pose",
746
+ "psm_retraction_gripper",
747
+ "psm_cutter_pose",
748
+ "psm_cutter_gripper"
749
+ ],
750
+ "min_max_embedding_keys": null,
751
+ "pass_through_keys": null,
752
+ "action_configs": null
753
+ },
754
+ "action": {
755
+ "delta_indices": [
756
+ 1,
757
+ 2,
758
+ 3,
759
+ 4,
760
+ 5,
761
+ 6,
762
+ 7,
763
+ 8,
764
+ 9,
765
+ 10,
766
+ 11,
767
+ 12,
768
+ 13,
769
+ 14,
770
+ 15,
771
+ 16,
772
+ 17,
773
+ 18,
774
+ 19,
775
+ 20,
776
+ 21,
777
+ 22,
778
+ 23,
779
+ 24,
780
+ 25,
781
+ 26,
782
+ 27,
783
+ 28,
784
+ 29,
785
+ 30,
786
+ 31,
787
+ 32,
788
+ 33,
789
+ 34,
790
+ 35,
791
+ 36,
792
+ 37,
793
+ 38,
794
+ 39,
795
+ 40,
796
+ 41,
797
+ 42,
798
+ 43,
799
+ 44,
800
+ 45,
801
+ 46,
802
+ 47,
803
+ 48,
804
+ 49,
805
+ 50
806
+ ],
807
+ "modality_keys": [
808
+ "psm_retraction_pose",
809
+ "psm_retraction_gripper",
810
+ "psm_cutter_pose",
811
+ "psm_cutter_gripper"
812
+ ],
813
+ "sin_cos_embedding_keys": null,
814
+ "mean_std_embedding_keys": null,
815
+ "min_max_embedding_keys": null,
816
+ "pass_through_keys": null,
817
+ "action_configs": [
818
+ {
819
+ "rep": "REL_XYZ_ROT6D",
820
+ "type": "EEF",
821
+ "format": "XYZ_ROT6D",
822
+ "state_key": "psm_retraction_pose",
823
+ "input_rotation_format": "quat",
824
+ "input_quat_order": "wxyz",
825
+ "reference_rotation_format": "quat",
826
+ "reference_quat_order": "wxyz",
827
+ "translation_scaling_key": null,
828
+ "rotation_scaling_key": null,
829
+ "hold_through_clutch": false,
830
+ "normalization_type": "percentile"
831
+ },
832
+ {
833
+ "rep": "ABSOLUTE",
834
+ "type": "NON_EEF",
835
+ "format": "DEFAULT",
836
+ "state_key": null,
837
+ "input_rotation_format": "quat",
838
+ "input_quat_order": "xyzw",
839
+ "reference_rotation_format": "rot6d",
840
+ "reference_quat_order": "xyzw",
841
+ "translation_scaling_key": null,
842
+ "rotation_scaling_key": null,
843
+ "hold_through_clutch": false,
844
+ "normalization_type": "percentile"
845
+ },
846
+ {
847
+ "rep": "REL_XYZ_ROT6D",
848
+ "type": "EEF",
849
+ "format": "XYZ_ROT6D",
850
+ "state_key": "psm_cutter_pose",
851
+ "input_rotation_format": "quat",
852
+ "input_quat_order": "wxyz",
853
+ "reference_rotation_format": "quat",
854
+ "reference_quat_order": "wxyz",
855
+ "translation_scaling_key": null,
856
+ "rotation_scaling_key": null,
857
+ "hold_through_clutch": false,
858
+ "normalization_type": "percentile"
859
+ },
860
+ {
861
+ "rep": "ABSOLUTE",
862
+ "type": "NON_EEF",
863
+ "format": "DEFAULT",
864
+ "state_key": null,
865
+ "input_rotation_format": "quat",
866
+ "input_quat_order": "xyzw",
867
+ "reference_rotation_format": "rot6d",
868
+ "reference_quat_order": "xyzw",
869
+ "translation_scaling_key": null,
870
+ "rotation_scaling_key": null,
871
+ "hold_through_clutch": false,
872
+ "normalization_type": "percentile"
873
+ }
874
+ ]
875
+ },
876
+ "language": {
877
+ "delta_indices": [
878
+ 0
879
+ ],
880
+ "modality_keys": [
881
+ "task"
882
+ ],
883
+ "sin_cos_embedding_keys": null,
884
+ "mean_std_embedding_keys": null,
885
+ "min_max_embedding_keys": null,
886
+ "pass_through_keys": null,
887
+ "action_configs": null
888
+ }
889
+ },
890
+ "jhu_imerse_dvrk": {
891
+ "video": {
892
+ "delta_indices": [
893
+ 0
894
+ ],
895
+ "modality_keys": [
896
+ "endoscope_left",
897
+ "wrist_left",
898
+ "wrist_right"
899
+ ],
900
+ "sin_cos_embedding_keys": null,
901
+ "mean_std_embedding_keys": null,
902
+ "min_max_embedding_keys": null,
903
+ "pass_through_keys": null,
904
+ "action_configs": null
905
+ },
906
+ "state": {
907
+ "delta_indices": [
908
+ 0
909
+ ],
910
+ "modality_keys": [
911
+ "psm1_pose",
912
+ "psm1_gripper",
913
+ "psm2_pose",
914
+ "psm2_gripper"
915
+ ],
916
+ "sin_cos_embedding_keys": null,
917
+ "mean_std_embedding_keys": [
918
+ "psm1_pose",
919
+ "psm1_gripper",
920
+ "psm2_pose",
921
+ "psm2_gripper"
922
+ ],
923
+ "min_max_embedding_keys": null,
924
+ "pass_through_keys": null,
925
+ "action_configs": null
926
+ },
927
+ "action": {
928
+ "delta_indices": [
929
+ 1,
930
+ 2,
931
+ 3,
932
+ 4,
933
+ 5,
934
+ 6,
935
+ 7,
936
+ 8,
937
+ 9,
938
+ 10,
939
+ 11,
940
+ 12,
941
+ 13,
942
+ 14,
943
+ 15,
944
+ 16,
945
+ 17,
946
+ 18,
947
+ 19,
948
+ 20,
949
+ 21,
950
+ 22,
951
+ 23,
952
+ 24,
953
+ 25,
954
+ 26,
955
+ 27,
956
+ 28,
957
+ 29,
958
+ 30,
959
+ 31,
960
+ 32,
961
+ 33,
962
+ 34,
963
+ 35,
964
+ 36,
965
+ 37,
966
+ 38,
967
+ 39,
968
+ 40,
969
+ 41,
970
+ 42,
971
+ 43,
972
+ 44,
973
+ 45,
974
+ 46,
975
+ 47,
976
+ 48,
977
+ 49,
978
+ 50
979
+ ],
980
+ "modality_keys": [
981
+ "psm1_pose",
982
+ "psm1_gripper",
983
+ "psm2_pose",
984
+ "psm2_gripper"
985
+ ],
986
+ "sin_cos_embedding_keys": null,
987
+ "mean_std_embedding_keys": null,
988
+ "min_max_embedding_keys": null,
989
+ "pass_through_keys": null,
990
+ "action_configs": [
991
+ {
992
+ "rep": "REL_XYZ_ROT6D",
993
+ "type": "EEF",
994
+ "format": "XYZ_ROT6D",
995
+ "state_key": "psm1_pose",
996
+ "input_rotation_format": "quat",
997
+ "input_quat_order": "xyzw",
998
+ "reference_rotation_format": "quat",
999
+ "reference_quat_order": "xyzw",
1000
+ "translation_scaling_key": null,
1001
+ "rotation_scaling_key": null,
1002
+ "hold_through_clutch": false,
1003
+ "normalization_type": "percentile"
1004
+ },
1005
+ {
1006
+ "rep": "ABSOLUTE",
1007
+ "type": "NON_EEF",
1008
+ "format": "DEFAULT",
1009
+ "state_key": null,
1010
+ "input_rotation_format": "quat",
1011
+ "input_quat_order": "xyzw",
1012
+ "reference_rotation_format": "rot6d",
1013
+ "reference_quat_order": "xyzw",
1014
+ "translation_scaling_key": null,
1015
+ "rotation_scaling_key": null,
1016
+ "hold_through_clutch": false,
1017
+ "normalization_type": "percentile"
1018
+ },
1019
+ {
1020
+ "rep": "REL_XYZ_ROT6D",
1021
+ "type": "EEF",
1022
+ "format": "XYZ_ROT6D",
1023
+ "state_key": "psm2_pose",
1024
+ "input_rotation_format": "quat",
1025
+ "input_quat_order": "xyzw",
1026
+ "reference_rotation_format": "quat",
1027
+ "reference_quat_order": "xyzw",
1028
+ "translation_scaling_key": null,
1029
+ "rotation_scaling_key": null,
1030
+ "hold_through_clutch": false,
1031
+ "normalization_type": "percentile"
1032
+ },
1033
+ {
1034
+ "rep": "ABSOLUTE",
1035
+ "type": "NON_EEF",
1036
+ "format": "DEFAULT",
1037
+ "state_key": null,
1038
+ "input_rotation_format": "quat",
1039
+ "input_quat_order": "xyzw",
1040
+ "reference_rotation_format": "rot6d",
1041
+ "reference_quat_order": "xyzw",
1042
+ "translation_scaling_key": null,
1043
+ "rotation_scaling_key": null,
1044
+ "hold_through_clutch": false,
1045
+ "normalization_type": "percentile"
1046
+ }
1047
+ ]
1048
+ },
1049
+ "language": {
1050
+ "delta_indices": [
1051
+ 0
1052
+ ],
1053
+ "modality_keys": [
1054
+ "annotation.human.task_description"
1055
+ ],
1056
+ "sin_cos_embedding_keys": null,
1057
+ "mean_std_embedding_keys": null,
1058
+ "min_max_embedding_keys": null,
1059
+ "pass_through_keys": null,
1060
+ "action_configs": null
1061
+ }
1062
+ },
1063
+ "obuda_dvrk": {
1064
+ "video": {
1065
+ "delta_indices": [
1066
+ 0
1067
+ ],
1068
+ "modality_keys": [
1069
+ "endoscope_left",
1070
+ "wrist_left",
1071
+ "wrist_right"
1072
+ ],
1073
+ "sin_cos_embedding_keys": null,
1074
+ "mean_std_embedding_keys": null,
1075
+ "min_max_embedding_keys": null,
1076
+ "pass_through_keys": null,
1077
+ "action_configs": null
1078
+ },
1079
+ "state": {
1080
+ "delta_indices": [
1081
+ 0
1082
+ ],
1083
+ "modality_keys": [
1084
+ "psm1_pose",
1085
+ "psm1_gripper",
1086
+ "psm2_pose",
1087
+ "psm2_gripper"
1088
+ ],
1089
+ "sin_cos_embedding_keys": null,
1090
+ "mean_std_embedding_keys": [
1091
+ "psm1_pose",
1092
+ "psm1_gripper",
1093
+ "psm2_pose",
1094
+ "psm2_gripper"
1095
+ ],
1096
+ "min_max_embedding_keys": null,
1097
+ "pass_through_keys": null,
1098
+ "action_configs": null
1099
+ },
1100
+ "action": {
1101
+ "delta_indices": [
1102
+ 0,
1103
+ 1,
1104
+ 2,
1105
+ 3,
1106
+ 4,
1107
+ 5,
1108
+ 6,
1109
+ 7,
1110
+ 8,
1111
+ 9,
1112
+ 10,
1113
+ 11,
1114
+ 12,
1115
+ 13,
1116
+ 14,
1117
+ 15,
1118
+ 16,
1119
+ 17,
1120
+ 18,
1121
+ 19,
1122
+ 20,
1123
+ 21,
1124
+ 22,
1125
+ 23,
1126
+ 24,
1127
+ 25,
1128
+ 26,
1129
+ 27,
1130
+ 28,
1131
+ 29,
1132
+ 30,
1133
+ 31,
1134
+ 32,
1135
+ 33,
1136
+ 34,
1137
+ 35,
1138
+ 36,
1139
+ 37,
1140
+ 38,
1141
+ 39,
1142
+ 40,
1143
+ 41,
1144
+ 42,
1145
+ 43,
1146
+ 44,
1147
+ 45,
1148
+ 46,
1149
+ 47,
1150
+ 48,
1151
+ 49
1152
+ ],
1153
+ "modality_keys": [
1154
+ "psm1_pose",
1155
+ "psm1_gripper",
1156
+ "psm2_pose",
1157
+ "psm2_gripper"
1158
+ ],
1159
+ "sin_cos_embedding_keys": null,
1160
+ "mean_std_embedding_keys": null,
1161
+ "min_max_embedding_keys": null,
1162
+ "pass_through_keys": null,
1163
+ "action_configs": [
1164
+ {
1165
+ "rep": "REL_XYZ_ROT6D",
1166
+ "type": "EEF",
1167
+ "format": "XYZ_ROT6D",
1168
+ "state_key": "psm1_pose",
1169
+ "input_rotation_format": "quat",
1170
+ "input_quat_order": "xyzw",
1171
+ "reference_rotation_format": "quat",
1172
+ "reference_quat_order": "xyzw",
1173
+ "translation_scaling_key": null,
1174
+ "rotation_scaling_key": null,
1175
+ "hold_through_clutch": false,
1176
+ "normalization_type": "percentile"
1177
+ },
1178
+ {
1179
+ "rep": "ABSOLUTE",
1180
+ "type": "NON_EEF",
1181
+ "format": "DEFAULT",
1182
+ "state_key": null,
1183
+ "input_rotation_format": "quat",
1184
+ "input_quat_order": "xyzw",
1185
+ "reference_rotation_format": "rot6d",
1186
+ "reference_quat_order": "xyzw",
1187
+ "translation_scaling_key": null,
1188
+ "rotation_scaling_key": null,
1189
+ "hold_through_clutch": false,
1190
+ "normalization_type": "percentile"
1191
+ },
1192
+ {
1193
+ "rep": "REL_XYZ_ROT6D",
1194
+ "type": "EEF",
1195
+ "format": "XYZ_ROT6D",
1196
+ "state_key": "psm2_pose",
1197
+ "input_rotation_format": "quat",
1198
+ "input_quat_order": "xyzw",
1199
+ "reference_rotation_format": "quat",
1200
+ "reference_quat_order": "xyzw",
1201
+ "translation_scaling_key": null,
1202
+ "rotation_scaling_key": null,
1203
+ "hold_through_clutch": false,
1204
+ "normalization_type": "percentile"
1205
+ },
1206
+ {
1207
+ "rep": "ABSOLUTE",
1208
+ "type": "NON_EEF",
1209
+ "format": "DEFAULT",
1210
+ "state_key": null,
1211
+ "input_rotation_format": "quat",
1212
+ "input_quat_order": "xyzw",
1213
+ "reference_rotation_format": "rot6d",
1214
+ "reference_quat_order": "xyzw",
1215
+ "translation_scaling_key": null,
1216
+ "rotation_scaling_key": null,
1217
+ "hold_through_clutch": false,
1218
+ "normalization_type": "percentile"
1219
+ }
1220
+ ]
1221
+ },
1222
+ "language": {
1223
+ "delta_indices": [
1224
+ 0
1225
+ ],
1226
+ "modality_keys": [
1227
+ "task"
1228
+ ],
1229
+ "sin_cos_embedding_keys": null,
1230
+ "mean_std_embedding_keys": null,
1231
+ "min_max_embedding_keys": null,
1232
+ "pass_through_keys": null,
1233
+ "action_configs": null
1234
+ }
1235
+ },
1236
+ "stanford_dvrk_real": {
1237
+ "video": {
1238
+ "delta_indices": [
1239
+ 0
1240
+ ],
1241
+ "modality_keys": [
1242
+ "endoscope_left"
1243
+ ],
1244
+ "sin_cos_embedding_keys": null,
1245
+ "mean_std_embedding_keys": null,
1246
+ "min_max_embedding_keys": null,
1247
+ "pass_through_keys": null,
1248
+ "action_configs": null
1249
+ },
1250
+ "state": {
1251
+ "delta_indices": [
1252
+ 0
1253
+ ],
1254
+ "modality_keys": [
1255
+ "psm1_pose",
1256
+ "psm1_gripper",
1257
+ "psm2_pose",
1258
+ "psm2_gripper"
1259
+ ],
1260
+ "sin_cos_embedding_keys": null,
1261
+ "mean_std_embedding_keys": [
1262
+ "psm1_pose",
1263
+ "psm1_gripper",
1264
+ "psm2_pose",
1265
+ "psm2_gripper"
1266
+ ],
1267
+ "min_max_embedding_keys": null,
1268
+ "pass_through_keys": null,
1269
+ "action_configs": null
1270
+ },
1271
+ "action": {
1272
+ "delta_indices": [
1273
+ 0,
1274
+ 1,
1275
+ 2,
1276
+ 3,
1277
+ 4,
1278
+ 5,
1279
+ 6,
1280
+ 7,
1281
+ 8,
1282
+ 9,
1283
+ 10,
1284
+ 11,
1285
+ 12,
1286
+ 13,
1287
+ 14,
1288
+ 15,
1289
+ 16,
1290
+ 17,
1291
+ 18,
1292
+ 19,
1293
+ 20,
1294
+ 21,
1295
+ 22,
1296
+ 23,
1297
+ 24,
1298
+ 25,
1299
+ 26,
1300
+ 27,
1301
+ 28,
1302
+ 29,
1303
+ 30,
1304
+ 31,
1305
+ 32,
1306
+ 33,
1307
+ 34,
1308
+ 35,
1309
+ 36,
1310
+ 37,
1311
+ 38,
1312
+ 39,
1313
+ 40,
1314
+ 41,
1315
+ 42,
1316
+ 43,
1317
+ 44,
1318
+ 45,
1319
+ 46,
1320
+ 47,
1321
+ 48,
1322
+ 49
1323
+ ],
1324
+ "modality_keys": [
1325
+ "psm1_pose",
1326
+ "psm1_gripper",
1327
+ "psm2_pose",
1328
+ "psm2_gripper"
1329
+ ],
1330
+ "sin_cos_embedding_keys": null,
1331
+ "mean_std_embedding_keys": null,
1332
+ "min_max_embedding_keys": null,
1333
+ "pass_through_keys": null,
1334
+ "action_configs": [
1335
+ {
1336
+ "rep": "REL_XYZ_ROT6D",
1337
+ "type": "EEF",
1338
+ "format": "XYZ_ROT6D",
1339
+ "state_key": "psm1_pose",
1340
+ "input_rotation_format": "euler",
1341
+ "input_quat_order": "xyzw",
1342
+ "reference_rotation_format": "euler",
1343
+ "reference_quat_order": "xyzw",
1344
+ "translation_scaling_key": null,
1345
+ "rotation_scaling_key": null,
1346
+ "hold_through_clutch": false,
1347
+ "normalization_type": "percentile"
1348
+ },
1349
+ {
1350
+ "rep": "ABSOLUTE",
1351
+ "type": "NON_EEF",
1352
+ "format": "DEFAULT",
1353
+ "state_key": null,
1354
+ "input_rotation_format": "quat",
1355
+ "input_quat_order": "xyzw",
1356
+ "reference_rotation_format": "rot6d",
1357
+ "reference_quat_order": "xyzw",
1358
+ "translation_scaling_key": null,
1359
+ "rotation_scaling_key": null,
1360
+ "hold_through_clutch": false,
1361
+ "normalization_type": "percentile"
1362
+ },
1363
+ {
1364
+ "rep": "REL_XYZ_ROT6D",
1365
+ "type": "EEF",
1366
+ "format": "XYZ_ROT6D",
1367
+ "state_key": "psm2_pose",
1368
+ "input_rotation_format": "euler",
1369
+ "input_quat_order": "xyzw",
1370
+ "reference_rotation_format": "euler",
1371
+ "reference_quat_order": "xyzw",
1372
+ "translation_scaling_key": null,
1373
+ "rotation_scaling_key": null,
1374
+ "hold_through_clutch": false,
1375
+ "normalization_type": "percentile"
1376
+ },
1377
+ {
1378
+ "rep": "ABSOLUTE",
1379
+ "type": "NON_EEF",
1380
+ "format": "DEFAULT",
1381
+ "state_key": null,
1382
+ "input_rotation_format": "quat",
1383
+ "input_quat_order": "xyzw",
1384
+ "reference_rotation_format": "rot6d",
1385
+ "reference_quat_order": "xyzw",
1386
+ "translation_scaling_key": null,
1387
+ "rotation_scaling_key": null,
1388
+ "hold_through_clutch": false,
1389
+ "normalization_type": "percentile"
1390
+ }
1391
+ ]
1392
+ },
1393
+ "language": {
1394
+ "delta_indices": [
1395
+ 0
1396
+ ],
1397
+ "modality_keys": [
1398
+ "task"
1399
+ ],
1400
+ "sin_cos_embedding_keys": null,
1401
+ "mean_std_embedding_keys": null,
1402
+ "min_max_embedding_keys": null,
1403
+ "pass_through_keys": null,
1404
+ "action_configs": null
1405
+ }
1406
+ },
1407
+ "tud_tundra_ur5e": {
1408
+ "video": {
1409
+ "delta_indices": [
1410
+ 0
1411
+ ],
1412
+ "modality_keys": [
1413
+ "laparoscope_left"
1414
+ ],
1415
+ "sin_cos_embedding_keys": null,
1416
+ "mean_std_embedding_keys": null,
1417
+ "min_max_embedding_keys": null,
1418
+ "pass_through_keys": null,
1419
+ "action_configs": null
1420
+ },
1421
+ "state": {
1422
+ "delta_indices": [
1423
+ 0
1424
+ ],
1425
+ "modality_keys": [
1426
+ "joint_position",
1427
+ "eef_pose"
1428
+ ],
1429
+ "sin_cos_embedding_keys": null,
1430
+ "mean_std_embedding_keys": [
1431
+ "joint_position"
1432
+ ],
1433
+ "min_max_embedding_keys": null,
1434
+ "pass_through_keys": [
1435
+ "eef_pose"
1436
+ ],
1437
+ "action_configs": null
1438
+ },
1439
+ "action": {
1440
+ "delta_indices": [
1441
+ 1,
1442
+ 2,
1443
+ 3,
1444
+ 4,
1445
+ 5,
1446
+ 6,
1447
+ 7,
1448
+ 8,
1449
+ 9,
1450
+ 10,
1451
+ 11,
1452
+ 12,
1453
+ 13,
1454
+ 14,
1455
+ 15,
1456
+ 16,
1457
+ 17,
1458
+ 18,
1459
+ 19,
1460
+ 20,
1461
+ 21,
1462
+ 22,
1463
+ 23,
1464
+ 24,
1465
+ 25,
1466
+ 26,
1467
+ 27,
1468
+ 28,
1469
+ 29,
1470
+ 30,
1471
+ 31,
1472
+ 32,
1473
+ 33,
1474
+ 34,
1475
+ 35,
1476
+ 36,
1477
+ 37,
1478
+ 38,
1479
+ 39,
1480
+ 40,
1481
+ 41,
1482
+ 42,
1483
+ 43,
1484
+ 44,
1485
+ 45,
1486
+ 46,
1487
+ 47,
1488
+ 48,
1489
+ 49,
1490
+ 50
1491
+ ],
1492
+ "modality_keys": [
1493
+ "eef_pose",
1494
+ "gripper"
1495
+ ],
1496
+ "sin_cos_embedding_keys": null,
1497
+ "mean_std_embedding_keys": null,
1498
+ "min_max_embedding_keys": null,
1499
+ "pass_through_keys": null,
1500
+ "action_configs": [
1501
+ {
1502
+ "rep": "REL_XYZ_ROT6D",
1503
+ "type": "EEF",
1504
+ "format": "XYZ_ROT6D",
1505
+ "state_key": "eef_pose",
1506
+ "input_rotation_format": "quat",
1507
+ "input_quat_order": "xyzw",
1508
+ "reference_rotation_format": "quat",
1509
+ "reference_quat_order": "xyzw",
1510
+ "translation_scaling_key": null,
1511
+ "rotation_scaling_key": null,
1512
+ "hold_through_clutch": false,
1513
+ "normalization_type": "percentile"
1514
+ },
1515
+ {
1516
+ "rep": "ABSOLUTE",
1517
+ "type": "NON_EEF",
1518
+ "format": "DEFAULT",
1519
+ "state_key": null,
1520
+ "input_rotation_format": "quat",
1521
+ "input_quat_order": "xyzw",
1522
+ "reference_rotation_format": "rot6d",
1523
+ "reference_quat_order": "xyzw",
1524
+ "translation_scaling_key": null,
1525
+ "rotation_scaling_key": null,
1526
+ "hold_through_clutch": false,
1527
+ "normalization_type": "percentile"
1528
+ }
1529
+ ]
1530
+ },
1531
+ "language": {
1532
+ "delta_indices": [
1533
+ 0
1534
+ ],
1535
+ "modality_keys": [
1536
+ "task"
1537
+ ],
1538
+ "sin_cos_embedding_keys": null,
1539
+ "mean_std_embedding_keys": null,
1540
+ "min_max_embedding_keys": null,
1541
+ "pass_through_keys": null,
1542
+ "action_configs": null
1543
+ }
1544
+ },
1545
+ "jhu_lscr_dvrk_smarts": {
1546
+ "video": {
1547
+ "delta_indices": [
1548
+ 0
1549
+ ],
1550
+ "modality_keys": [
1551
+ "endoscope_left",
1552
+ "camera_side_view"
1553
+ ],
1554
+ "sin_cos_embedding_keys": null,
1555
+ "mean_std_embedding_keys": null,
1556
+ "min_max_embedding_keys": null,
1557
+ "pass_through_keys": null,
1558
+ "action_configs": null
1559
+ },
1560
+ "state": {
1561
+ "delta_indices": [
1562
+ 0
1563
+ ],
1564
+ "modality_keys": [
1565
+ "psm1_pose",
1566
+ "psm1_gripper",
1567
+ "psm2_pose",
1568
+ "psm2_gripper"
1569
+ ],
1570
+ "sin_cos_embedding_keys": null,
1571
+ "mean_std_embedding_keys": [
1572
+ "psm1_pose",
1573
+ "psm1_gripper",
1574
+ "psm2_pose",
1575
+ "psm2_gripper"
1576
+ ],
1577
+ "min_max_embedding_keys": null,
1578
+ "pass_through_keys": null,
1579
+ "action_configs": null
1580
+ },
1581
+ "action": {
1582
+ "delta_indices": [
1583
+ 1,
1584
+ 2,
1585
+ 3,
1586
+ 4,
1587
+ 5,
1588
+ 6,
1589
+ 7,
1590
+ 8,
1591
+ 9,
1592
+ 10,
1593
+ 11,
1594
+ 12,
1595
+ 13,
1596
+ 14,
1597
+ 15,
1598
+ 16
1599
+ ],
1600
+ "modality_keys": [
1601
+ "psm1_pose",
1602
+ "psm1_gripper",
1603
+ "psm2_pose",
1604
+ "psm2_gripper"
1605
+ ],
1606
+ "sin_cos_embedding_keys": null,
1607
+ "mean_std_embedding_keys": null,
1608
+ "min_max_embedding_keys": null,
1609
+ "pass_through_keys": null,
1610
+ "action_configs": [
1611
+ {
1612
+ "rep": "REL_XYZ_ROT6D",
1613
+ "type": "EEF",
1614
+ "format": "XYZ_ROT6D",
1615
+ "state_key": "psm1_pose",
1616
+ "input_rotation_format": "quat",
1617
+ "input_quat_order": "xyzw",
1618
+ "reference_rotation_format": "quat",
1619
+ "reference_quat_order": "xyzw",
1620
+ "translation_scaling_key": null,
1621
+ "rotation_scaling_key": null,
1622
+ "hold_through_clutch": false,
1623
+ "normalization_type": "percentile"
1624
+ },
1625
+ {
1626
+ "rep": "ABSOLUTE",
1627
+ "type": "NON_EEF",
1628
+ "format": "DEFAULT",
1629
+ "state_key": null,
1630
+ "input_rotation_format": "quat",
1631
+ "input_quat_order": "xyzw",
1632
+ "reference_rotation_format": "rot6d",
1633
+ "reference_quat_order": "xyzw",
1634
+ "translation_scaling_key": null,
1635
+ "rotation_scaling_key": null,
1636
+ "hold_through_clutch": false,
1637
+ "normalization_type": "percentile"
1638
+ },
1639
+ {
1640
+ "rep": "REL_XYZ_ROT6D",
1641
+ "type": "EEF",
1642
+ "format": "XYZ_ROT6D",
1643
+ "state_key": "psm2_pose",
1644
+ "input_rotation_format": "quat",
1645
+ "input_quat_order": "xyzw",
1646
+ "reference_rotation_format": "quat",
1647
+ "reference_quat_order": "xyzw",
1648
+ "translation_scaling_key": null,
1649
+ "rotation_scaling_key": null,
1650
+ "hold_through_clutch": false,
1651
+ "normalization_type": "percentile"
1652
+ },
1653
+ {
1654
+ "rep": "ABSOLUTE",
1655
+ "type": "NON_EEF",
1656
+ "format": "DEFAULT",
1657
+ "state_key": null,
1658
+ "input_rotation_format": "quat",
1659
+ "input_quat_order": "xyzw",
1660
+ "reference_rotation_format": "rot6d",
1661
+ "reference_quat_order": "xyzw",
1662
+ "translation_scaling_key": null,
1663
+ "rotation_scaling_key": null,
1664
+ "hold_through_clutch": false,
1665
+ "normalization_type": "percentile"
1666
+ }
1667
+ ]
1668
+ },
1669
+ "language": {
1670
+ "delta_indices": [
1671
+ 0
1672
+ ],
1673
+ "modality_keys": [
1674
+ "annotation.task"
1675
+ ],
1676
+ "sin_cos_embedding_keys": null,
1677
+ "mean_std_embedding_keys": null,
1678
+ "min_max_embedding_keys": null,
1679
+ "pass_through_keys": null,
1680
+ "action_configs": null
1681
+ }
1682
+ },
1683
+ "jhu_imerse_dvrk_mono": {
1684
+ "video": {
1685
+ "delta_indices": [
1686
+ 0
1687
+ ],
1688
+ "modality_keys": [
1689
+ "endoscope_left"
1690
+ ],
1691
+ "sin_cos_embedding_keys": null,
1692
+ "mean_std_embedding_keys": null,
1693
+ "min_max_embedding_keys": null,
1694
+ "pass_through_keys": null,
1695
+ "action_configs": null
1696
+ },
1697
+ "state": {
1698
+ "delta_indices": [
1699
+ 0
1700
+ ],
1701
+ "modality_keys": [
1702
+ "psm1_pose",
1703
+ "psm1_gripper",
1704
+ "psm2_pose",
1705
+ "psm2_gripper"
1706
+ ],
1707
+ "sin_cos_embedding_keys": null,
1708
+ "mean_std_embedding_keys": [
1709
+ "psm1_pose",
1710
+ "psm1_gripper",
1711
+ "psm2_pose",
1712
+ "psm2_gripper"
1713
+ ],
1714
+ "min_max_embedding_keys": null,
1715
+ "pass_through_keys": null,
1716
+ "action_configs": null
1717
+ },
1718
+ "action": {
1719
+ "delta_indices": [
1720
+ 0,
1721
+ 1,
1722
+ 2,
1723
+ 3,
1724
+ 4,
1725
+ 5,
1726
+ 6,
1727
+ 7,
1728
+ 8,
1729
+ 9,
1730
+ 10,
1731
+ 11,
1732
+ 12,
1733
+ 13,
1734
+ 14,
1735
+ 15,
1736
+ 16,
1737
+ 17,
1738
+ 18,
1739
+ 19,
1740
+ 20,
1741
+ 21,
1742
+ 22,
1743
+ 23,
1744
+ 24,
1745
+ 25,
1746
+ 26,
1747
+ 27,
1748
+ 28,
1749
+ 29,
1750
+ 30,
1751
+ 31,
1752
+ 32,
1753
+ 33,
1754
+ 34,
1755
+ 35,
1756
+ 36,
1757
+ 37,
1758
+ 38,
1759
+ 39,
1760
+ 40,
1761
+ 41,
1762
+ 42,
1763
+ 43,
1764
+ 44,
1765
+ 45,
1766
+ 46,
1767
+ 47,
1768
+ 48,
1769
+ 49
1770
+ ],
1771
+ "modality_keys": [
1772
+ "psm1_pose",
1773
+ "psm1_gripper",
1774
+ "psm2_pose",
1775
+ "psm2_gripper"
1776
+ ],
1777
+ "sin_cos_embedding_keys": null,
1778
+ "mean_std_embedding_keys": null,
1779
+ "min_max_embedding_keys": null,
1780
+ "pass_through_keys": null,
1781
+ "action_configs": [
1782
+ {
1783
+ "rep": "REL_XYZ_ROT6D",
1784
+ "type": "EEF",
1785
+ "format": "XYZ_ROT6D",
1786
+ "state_key": "psm1_pose",
1787
+ "input_rotation_format": "quat",
1788
+ "input_quat_order": "xyzw",
1789
+ "reference_rotation_format": "quat",
1790
+ "reference_quat_order": "xyzw",
1791
+ "translation_scaling_key": null,
1792
+ "rotation_scaling_key": null,
1793
+ "hold_through_clutch": false,
1794
+ "normalization_type": "temporal_meanstd"
1795
+ },
1796
+ {
1797
+ "rep": "ABSOLUTE",
1798
+ "type": "NON_EEF",
1799
+ "format": "DEFAULT",
1800
+ "state_key": null,
1801
+ "input_rotation_format": "quat",
1802
+ "input_quat_order": "xyzw",
1803
+ "reference_rotation_format": "rot6d",
1804
+ "reference_quat_order": "xyzw",
1805
+ "translation_scaling_key": null,
1806
+ "rotation_scaling_key": null,
1807
+ "hold_through_clutch": false,
1808
+ "normalization_type": "temporal_meanstd"
1809
+ },
1810
+ {
1811
+ "rep": "REL_XYZ_ROT6D",
1812
+ "type": "EEF",
1813
+ "format": "XYZ_ROT6D",
1814
+ "state_key": "psm2_pose",
1815
+ "input_rotation_format": "quat",
1816
+ "input_quat_order": "xyzw",
1817
+ "reference_rotation_format": "quat",
1818
+ "reference_quat_order": "xyzw",
1819
+ "translation_scaling_key": null,
1820
+ "rotation_scaling_key": null,
1821
+ "hold_through_clutch": false,
1822
+ "normalization_type": "temporal_meanstd"
1823
+ },
1824
+ {
1825
+ "rep": "ABSOLUTE",
1826
+ "type": "NON_EEF",
1827
+ "format": "DEFAULT",
1828
+ "state_key": null,
1829
+ "input_rotation_format": "quat",
1830
+ "input_quat_order": "xyzw",
1831
+ "reference_rotation_format": "rot6d",
1832
+ "reference_quat_order": "xyzw",
1833
+ "translation_scaling_key": null,
1834
+ "rotation_scaling_key": null,
1835
+ "hold_through_clutch": false,
1836
+ "normalization_type": "temporal_meanstd"
1837
+ }
1838
+ ]
1839
+ },
1840
+ "language": {
1841
+ "delta_indices": [
1842
+ 0
1843
+ ],
1844
+ "modality_keys": [
1845
+ "annotation.human.task_description"
1846
+ ],
1847
+ "sin_cos_embedding_keys": null,
1848
+ "mean_std_embedding_keys": null,
1849
+ "min_max_embedding_keys": null,
1850
+ "pass_through_keys": null,
1851
+ "action_configs": null
1852
+ }
1853
+ },
1854
+ "rob_surgical_bitrack": {
1855
+ "video": {
1856
+ "delta_indices": [
1857
+ 0
1858
+ ],
1859
+ "modality_keys": [
1860
+ "endoscope"
1861
+ ],
1862
+ "sin_cos_embedding_keys": null,
1863
+ "mean_std_embedding_keys": null,
1864
+ "min_max_embedding_keys": null,
1865
+ "pass_through_keys": null,
1866
+ "action_configs": null
1867
+ },
1868
+ "state": {
1869
+ "delta_indices": [
1870
+ 0
1871
+ ],
1872
+ "modality_keys": [
1873
+ "left_pose",
1874
+ "right_pose",
1875
+ "aux_pose"
1876
+ ],
1877
+ "sin_cos_embedding_keys": null,
1878
+ "mean_std_embedding_keys": [
1879
+ "left_pose",
1880
+ "right_pose",
1881
+ "aux_pose"
1882
+ ],
1883
+ "min_max_embedding_keys": null,
1884
+ "pass_through_keys": null,
1885
+ "action_configs": null
1886
+ },
1887
+ "action": {
1888
+ "delta_indices": [
1889
+ 0,
1890
+ 1,
1891
+ 2,
1892
+ 3,
1893
+ 4,
1894
+ 5,
1895
+ 6,
1896
+ 7,
1897
+ 8,
1898
+ 9,
1899
+ 10,
1900
+ 11,
1901
+ 12,
1902
+ 13,
1903
+ 14,
1904
+ 15,
1905
+ 16,
1906
+ 17,
1907
+ 18,
1908
+ 19,
1909
+ 20,
1910
+ 21,
1911
+ 22,
1912
+ 23,
1913
+ 24,
1914
+ 25,
1915
+ 26,
1916
+ 27,
1917
+ 28,
1918
+ 29,
1919
+ 30,
1920
+ 31,
1921
+ 32,
1922
+ 33,
1923
+ 34,
1924
+ 35,
1925
+ 36,
1926
+ 37,
1927
+ 38,
1928
+ 39,
1929
+ 40,
1930
+ 41,
1931
+ 42,
1932
+ 43,
1933
+ 44,
1934
+ 45,
1935
+ 46,
1936
+ 47,
1937
+ 48,
1938
+ 49
1939
+ ],
1940
+ "modality_keys": [
1941
+ "left_pose",
1942
+ "right_pose",
1943
+ "aux_pose"
1944
+ ],
1945
+ "sin_cos_embedding_keys": null,
1946
+ "mean_std_embedding_keys": null,
1947
+ "min_max_embedding_keys": null,
1948
+ "pass_through_keys": null,
1949
+ "action_configs": [
1950
+ {
1951
+ "rep": "REL_XYZ_ROT6D",
1952
+ "type": "EEF",
1953
+ "format": "XYZ_ROT6D",
1954
+ "state_key": "left_pose",
1955
+ "input_rotation_format": "euler",
1956
+ "input_quat_order": "xyzw",
1957
+ "reference_rotation_format": "euler",
1958
+ "reference_quat_order": "xyzw",
1959
+ "translation_scaling_key": null,
1960
+ "rotation_scaling_key": null,
1961
+ "hold_through_clutch": false,
1962
+ "normalization_type": "percentile"
1963
+ },
1964
+ {
1965
+ "rep": "REL_XYZ_ROT6D",
1966
+ "type": "EEF",
1967
+ "format": "XYZ_ROT6D",
1968
+ "state_key": "right_pose",
1969
+ "input_rotation_format": "euler",
1970
+ "input_quat_order": "xyzw",
1971
+ "reference_rotation_format": "euler",
1972
+ "reference_quat_order": "xyzw",
1973
+ "translation_scaling_key": null,
1974
+ "rotation_scaling_key": null,
1975
+ "hold_through_clutch": false,
1976
+ "normalization_type": "percentile"
1977
+ },
1978
+ {
1979
+ "rep": "REL_XYZ_ROT6D",
1980
+ "type": "EEF",
1981
+ "format": "XYZ_ROT6D",
1982
+ "state_key": "aux_pose",
1983
+ "input_rotation_format": "euler",
1984
+ "input_quat_order": "xyzw",
1985
+ "reference_rotation_format": "euler",
1986
+ "reference_quat_order": "xyzw",
1987
+ "translation_scaling_key": null,
1988
+ "rotation_scaling_key": null,
1989
+ "hold_through_clutch": false,
1990
+ "normalization_type": "percentile"
1991
+ }
1992
+ ]
1993
+ },
1994
+ "language": {
1995
+ "delta_indices": [
1996
+ 0
1997
+ ],
1998
+ "modality_keys": [
1999
+ "annotation.instruction"
2000
+ ],
2001
+ "sin_cos_embedding_keys": null,
2002
+ "mean_std_embedding_keys": null,
2003
+ "min_max_embedding_keys": null,
2004
+ "pass_through_keys": null,
2005
+ "action_configs": null
2006
+ }
2007
+ },
2008
+ "turin_mitic_ex_vivo": {
2009
+ "video": {
2010
+ "delta_indices": [
2011
+ 0
2012
+ ],
2013
+ "modality_keys": [
2014
+ "endoscope_left"
2015
+ ],
2016
+ "sin_cos_embedding_keys": null,
2017
+ "mean_std_embedding_keys": null,
2018
+ "min_max_embedding_keys": null,
2019
+ "pass_through_keys": null,
2020
+ "action_configs": null
2021
+ },
2022
+ "state": {
2023
+ "delta_indices": [
2024
+ 0
2025
+ ],
2026
+ "modality_keys": [
2027
+ "psm1_joints",
2028
+ "psm2_joints",
2029
+ "psm1_pose",
2030
+ "psm2_pose"
2031
+ ],
2032
+ "sin_cos_embedding_keys": null,
2033
+ "mean_std_embedding_keys": [
2034
+ "psm1_joints",
2035
+ "psm2_joints"
2036
+ ],
2037
+ "min_max_embedding_keys": null,
2038
+ "pass_through_keys": [
2039
+ "psm1_pose",
2040
+ "psm2_pose"
2041
+ ],
2042
+ "action_configs": null
2043
+ },
2044
+ "action": {
2045
+ "delta_indices": [
2046
+ 1,
2047
+ 2,
2048
+ 3,
2049
+ 4,
2050
+ 5,
2051
+ 6,
2052
+ 7,
2053
+ 8,
2054
+ 9,
2055
+ 10,
2056
+ 11,
2057
+ 12,
2058
+ 13,
2059
+ 14,
2060
+ 15,
2061
+ 16,
2062
+ 17,
2063
+ 18,
2064
+ 19,
2065
+ 20,
2066
+ 21,
2067
+ 22,
2068
+ 23,
2069
+ 24,
2070
+ 25,
2071
+ 26,
2072
+ 27,
2073
+ 28,
2074
+ 29,
2075
+ 30,
2076
+ 31,
2077
+ 32,
2078
+ 33,
2079
+ 34,
2080
+ 35,
2081
+ 36,
2082
+ 37,
2083
+ 38,
2084
+ 39,
2085
+ 40,
2086
+ 41,
2087
+ 42,
2088
+ 43,
2089
+ 44,
2090
+ 45,
2091
+ 46,
2092
+ 47,
2093
+ 48,
2094
+ 49,
2095
+ 50
2096
+ ],
2097
+ "modality_keys": [
2098
+ "psm1_pose",
2099
+ "psm2_pose"
2100
+ ],
2101
+ "sin_cos_embedding_keys": null,
2102
+ "mean_std_embedding_keys": null,
2103
+ "min_max_embedding_keys": null,
2104
+ "pass_through_keys": null,
2105
+ "action_configs": [
2106
+ {
2107
+ "rep": "REL_XYZ_ROT6D",
2108
+ "type": "EEF",
2109
+ "format": "XYZ_ROT6D",
2110
+ "state_key": "psm1_pose",
2111
+ "input_rotation_format": "quat",
2112
+ "input_quat_order": "xyzw",
2113
+ "reference_rotation_format": "quat",
2114
+ "reference_quat_order": "xyzw",
2115
+ "translation_scaling_key": null,
2116
+ "rotation_scaling_key": null,
2117
+ "hold_through_clutch": false,
2118
+ "normalization_type": "percentile"
2119
+ },
2120
+ {
2121
+ "rep": "REL_XYZ_ROT6D",
2122
+ "type": "EEF",
2123
+ "format": "XYZ_ROT6D",
2124
+ "state_key": "psm2_pose",
2125
+ "input_rotation_format": "quat",
2126
+ "input_quat_order": "xyzw",
2127
+ "reference_rotation_format": "quat",
2128
+ "reference_quat_order": "xyzw",
2129
+ "translation_scaling_key": null,
2130
+ "rotation_scaling_key": null,
2131
+ "hold_through_clutch": false,
2132
+ "normalization_type": "percentile"
2133
+ }
2134
+ ]
2135
+ },
2136
+ "language": {
2137
+ "delta_indices": [
2138
+ 0
2139
+ ],
2140
+ "modality_keys": [
2141
+ "annotation.instruction"
2142
+ ],
2143
+ "sin_cos_embedding_keys": null,
2144
+ "mean_std_embedding_keys": null,
2145
+ "min_max_embedding_keys": null,
2146
+ "pass_through_keys": null,
2147
+ "action_configs": null
2148
+ }
2149
+ },
2150
+ "ustc_torin_tuodao": {
2151
+ "video": {
2152
+ "delta_indices": [
2153
+ 0
2154
+ ],
2155
+ "modality_keys": [
2156
+ "endoscope_left"
2157
+ ],
2158
+ "sin_cos_embedding_keys": null,
2159
+ "mean_std_embedding_keys": null,
2160
+ "min_max_embedding_keys": null,
2161
+ "pass_through_keys": null,
2162
+ "action_configs": null
2163
+ },
2164
+ "state": {
2165
+ "delta_indices": [
2166
+ 0
2167
+ ],
2168
+ "modality_keys": [
2169
+ "left_joints",
2170
+ "right_joints",
2171
+ "left_pose",
2172
+ "right_pose"
2173
+ ],
2174
+ "sin_cos_embedding_keys": null,
2175
+ "mean_std_embedding_keys": [
2176
+ "left_joints",
2177
+ "right_joints"
2178
+ ],
2179
+ "min_max_embedding_keys": null,
2180
+ "pass_through_keys": [
2181
+ "left_pose",
2182
+ "right_pose"
2183
+ ],
2184
+ "action_configs": null
2185
+ },
2186
+ "action": {
2187
+ "delta_indices": [
2188
+ 0,
2189
+ 1,
2190
+ 2,
2191
+ 3,
2192
+ 4,
2193
+ 5,
2194
+ 6,
2195
+ 7,
2196
+ 8,
2197
+ 9,
2198
+ 10,
2199
+ 11,
2200
+ 12,
2201
+ 13,
2202
+ 14,
2203
+ 15,
2204
+ 16,
2205
+ 17,
2206
+ 18,
2207
+ 19,
2208
+ 20,
2209
+ 21,
2210
+ 22,
2211
+ 23,
2212
+ 24,
2213
+ 25,
2214
+ 26,
2215
+ 27,
2216
+ 28,
2217
+ 29,
2218
+ 30,
2219
+ 31,
2220
+ 32,
2221
+ 33,
2222
+ 34,
2223
+ 35,
2224
+ 36,
2225
+ 37,
2226
+ 38,
2227
+ 39,
2228
+ 40,
2229
+ 41,
2230
+ 42,
2231
+ 43,
2232
+ 44,
2233
+ 45,
2234
+ 46,
2235
+ 47,
2236
+ 48,
2237
+ 49
2238
+ ],
2239
+ "modality_keys": [
2240
+ "left_pose",
2241
+ "left_gripper",
2242
+ "right_pose",
2243
+ "right_gripper"
2244
+ ],
2245
+ "sin_cos_embedding_keys": null,
2246
+ "mean_std_embedding_keys": null,
2247
+ "min_max_embedding_keys": null,
2248
+ "pass_through_keys": null,
2249
+ "action_configs": [
2250
+ {
2251
+ "rep": "REL_XYZ_ROT6D",
2252
+ "type": "EEF",
2253
+ "format": "XYZ_ROT6D",
2254
+ "state_key": "left_pose",
2255
+ "input_rotation_format": "quat",
2256
+ "input_quat_order": "xyzw",
2257
+ "reference_rotation_format": "quat",
2258
+ "reference_quat_order": "xyzw",
2259
+ "translation_scaling_key": null,
2260
+ "rotation_scaling_key": null,
2261
+ "hold_through_clutch": false,
2262
+ "normalization_type": "percentile"
2263
+ },
2264
+ {
2265
+ "rep": "ABSOLUTE",
2266
+ "type": "NON_EEF",
2267
+ "format": "DEFAULT",
2268
+ "state_key": null,
2269
+ "input_rotation_format": "quat",
2270
+ "input_quat_order": "xyzw",
2271
+ "reference_rotation_format": "rot6d",
2272
+ "reference_quat_order": "xyzw",
2273
+ "translation_scaling_key": null,
2274
+ "rotation_scaling_key": null,
2275
+ "hold_through_clutch": false,
2276
+ "normalization_type": "percentile"
2277
+ },
2278
+ {
2279
+ "rep": "REL_XYZ_ROT6D",
2280
+ "type": "EEF",
2281
+ "format": "XYZ_ROT6D",
2282
+ "state_key": "right_pose",
2283
+ "input_rotation_format": "quat",
2284
+ "input_quat_order": "xyzw",
2285
+ "reference_rotation_format": "quat",
2286
+ "reference_quat_order": "xyzw",
2287
+ "translation_scaling_key": null,
2288
+ "rotation_scaling_key": null,
2289
+ "hold_through_clutch": false,
2290
+ "normalization_type": "percentile"
2291
+ },
2292
+ {
2293
+ "rep": "ABSOLUTE",
2294
+ "type": "NON_EEF",
2295
+ "format": "DEFAULT",
2296
+ "state_key": null,
2297
+ "input_rotation_format": "quat",
2298
+ "input_quat_order": "xyzw",
2299
+ "reference_rotation_format": "rot6d",
2300
+ "reference_quat_order": "xyzw",
2301
+ "translation_scaling_key": null,
2302
+ "rotation_scaling_key": null,
2303
+ "hold_through_clutch": false,
2304
+ "normalization_type": "percentile"
2305
+ }
2306
+ ]
2307
+ },
2308
+ "language": {
2309
+ "delta_indices": [
2310
+ 0
2311
+ ],
2312
+ "modality_keys": [
2313
+ "annotation.instruction"
2314
+ ],
2315
+ "sin_cos_embedding_keys": null,
2316
+ "mean_std_embedding_keys": null,
2317
+ "min_max_embedding_keys": null,
2318
+ "pass_through_keys": null,
2319
+ "action_configs": null
2320
+ }
2321
+ },
2322
+ "hamlyn_dvrk_30hz": {
2323
+ "video": {
2324
+ "delta_indices": [
2325
+ 0
2326
+ ],
2327
+ "modality_keys": [
2328
+ "endoscope",
2329
+ "wrist_left",
2330
+ "wrist_right"
2331
+ ],
2332
+ "sin_cos_embedding_keys": null,
2333
+ "mean_std_embedding_keys": null,
2334
+ "min_max_embedding_keys": null,
2335
+ "pass_through_keys": null,
2336
+ "action_configs": null
2337
+ },
2338
+ "state": {
2339
+ "delta_indices": [
2340
+ 0
2341
+ ],
2342
+ "modality_keys": [
2343
+ "left_arm_pose",
2344
+ "left_arm_gripper",
2345
+ "right_arm_pose",
2346
+ "right_arm_gripper"
2347
+ ],
2348
+ "sin_cos_embedding_keys": null,
2349
+ "mean_std_embedding_keys": [
2350
+ "left_arm_pose",
2351
+ "left_arm_gripper",
2352
+ "right_arm_pose",
2353
+ "right_arm_gripper"
2354
+ ],
2355
+ "min_max_embedding_keys": null,
2356
+ "pass_through_keys": null,
2357
+ "action_configs": null
2358
+ },
2359
+ "action": {
2360
+ "delta_indices": [
2361
+ 0,
2362
+ 1,
2363
+ 2,
2364
+ 3,
2365
+ 4,
2366
+ 5,
2367
+ 6,
2368
+ 7,
2369
+ 8,
2370
+ 9,
2371
+ 10,
2372
+ 11,
2373
+ 12,
2374
+ 13,
2375
+ 14,
2376
+ 15,
2377
+ 16,
2378
+ 17,
2379
+ 18,
2380
+ 19,
2381
+ 20,
2382
+ 21,
2383
+ 22,
2384
+ 23,
2385
+ 24,
2386
+ 25,
2387
+ 26,
2388
+ 27,
2389
+ 28,
2390
+ 29,
2391
+ 30,
2392
+ 31,
2393
+ 32,
2394
+ 33,
2395
+ 34,
2396
+ 35,
2397
+ 36,
2398
+ 37,
2399
+ 38,
2400
+ 39,
2401
+ 40,
2402
+ 41,
2403
+ 42,
2404
+ 43,
2405
+ 44,
2406
+ 45,
2407
+ 46,
2408
+ 47,
2409
+ 48,
2410
+ 49
2411
+ ],
2412
+ "modality_keys": [
2413
+ "left_arm_pose",
2414
+ "left_arm_gripper",
2415
+ "right_arm_pose",
2416
+ "right_arm_gripper"
2417
+ ],
2418
+ "sin_cos_embedding_keys": null,
2419
+ "mean_std_embedding_keys": null,
2420
+ "min_max_embedding_keys": null,
2421
+ "pass_through_keys": null,
2422
+ "action_configs": [
2423
+ {
2424
+ "rep": "REL_XYZ_ROT6D",
2425
+ "type": "EEF",
2426
+ "format": "XYZ_ROT6D",
2427
+ "state_key": "left_arm_pose",
2428
+ "input_rotation_format": "quat",
2429
+ "input_quat_order": "wxyz",
2430
+ "reference_rotation_format": "quat",
2431
+ "reference_quat_order": "wxyz",
2432
+ "translation_scaling_key": null,
2433
+ "rotation_scaling_key": null,
2434
+ "hold_through_clutch": false,
2435
+ "normalization_type": "percentile"
2436
+ },
2437
+ {
2438
+ "rep": "ABSOLUTE",
2439
+ "type": "NON_EEF",
2440
+ "format": "DEFAULT",
2441
+ "state_key": null,
2442
+ "input_rotation_format": "quat",
2443
+ "input_quat_order": "xyzw",
2444
+ "reference_rotation_format": "rot6d",
2445
+ "reference_quat_order": "xyzw",
2446
+ "translation_scaling_key": null,
2447
+ "rotation_scaling_key": null,
2448
+ "hold_through_clutch": false,
2449
+ "normalization_type": "percentile"
2450
+ },
2451
+ {
2452
+ "rep": "REL_XYZ_ROT6D",
2453
+ "type": "EEF",
2454
+ "format": "XYZ_ROT6D",
2455
+ "state_key": "right_arm_pose",
2456
+ "input_rotation_format": "quat",
2457
+ "input_quat_order": "wxyz",
2458
+ "reference_rotation_format": "quat",
2459
+ "reference_quat_order": "wxyz",
2460
+ "translation_scaling_key": null,
2461
+ "rotation_scaling_key": null,
2462
+ "hold_through_clutch": false,
2463
+ "normalization_type": "percentile"
2464
+ },
2465
+ {
2466
+ "rep": "ABSOLUTE",
2467
+ "type": "NON_EEF",
2468
+ "format": "DEFAULT",
2469
+ "state_key": null,
2470
+ "input_rotation_format": "quat",
2471
+ "input_quat_order": "xyzw",
2472
+ "reference_rotation_format": "rot6d",
2473
+ "reference_quat_order": "xyzw",
2474
+ "translation_scaling_key": null,
2475
+ "rotation_scaling_key": null,
2476
+ "hold_through_clutch": false,
2477
+ "normalization_type": "percentile"
2478
+ }
2479
+ ]
2480
+ },
2481
+ "language": {
2482
+ "delta_indices": [
2483
+ 0
2484
+ ],
2485
+ "modality_keys": [
2486
+ "task"
2487
+ ],
2488
+ "sin_cos_embedding_keys": null,
2489
+ "mean_std_embedding_keys": null,
2490
+ "min_max_embedding_keys": null,
2491
+ "pass_through_keys": null,
2492
+ "action_configs": null
2493
+ }
2494
+ },
2495
+ "ucb_dvrk": {
2496
+ "video": {
2497
+ "delta_indices": [
2498
+ 0
2499
+ ],
2500
+ "modality_keys": [
2501
+ "camera_left"
2502
+ ],
2503
+ "sin_cos_embedding_keys": null,
2504
+ "mean_std_embedding_keys": null,
2505
+ "min_max_embedding_keys": null,
2506
+ "pass_through_keys": null,
2507
+ "action_configs": null
2508
+ },
2509
+ "state": {
2510
+ "delta_indices": [
2511
+ 0
2512
+ ],
2513
+ "modality_keys": [
2514
+ "psm1_joints",
2515
+ "psm1_gripper",
2516
+ "psm2_joints",
2517
+ "psm2_gripper",
2518
+ "psm1_pose",
2519
+ "psm2_pose"
2520
+ ],
2521
+ "sin_cos_embedding_keys": null,
2522
+ "mean_std_embedding_keys": [
2523
+ "psm1_joints",
2524
+ "psm1_gripper",
2525
+ "psm2_joints",
2526
+ "psm2_gripper"
2527
+ ],
2528
+ "min_max_embedding_keys": null,
2529
+ "pass_through_keys": [
2530
+ "psm1_pose",
2531
+ "psm2_pose"
2532
+ ],
2533
+ "action_configs": null
2534
+ },
2535
+ "action": {
2536
+ "delta_indices": [
2537
+ 0,
2538
+ 1,
2539
+ 2,
2540
+ 3,
2541
+ 4,
2542
+ 5,
2543
+ 6,
2544
+ 7,
2545
+ 8,
2546
+ 9,
2547
+ 10,
2548
+ 11,
2549
+ 12,
2550
+ 13,
2551
+ 14,
2552
+ 15,
2553
+ 16,
2554
+ 17,
2555
+ 18,
2556
+ 19,
2557
+ 20,
2558
+ 21,
2559
+ 22,
2560
+ 23,
2561
+ 24,
2562
+ 25,
2563
+ 26,
2564
+ 27,
2565
+ 28,
2566
+ 29,
2567
+ 30,
2568
+ 31,
2569
+ 32,
2570
+ 33,
2571
+ 34,
2572
+ 35,
2573
+ 36,
2574
+ 37,
2575
+ 38,
2576
+ 39,
2577
+ 40,
2578
+ 41,
2579
+ 42,
2580
+ 43,
2581
+ 44,
2582
+ 45,
2583
+ 46,
2584
+ 47,
2585
+ 48,
2586
+ 49
2587
+ ],
2588
+ "modality_keys": [
2589
+ "psm1_pose",
2590
+ "psm1_gripper",
2591
+ "psm2_pose",
2592
+ "psm2_gripper"
2593
+ ],
2594
+ "sin_cos_embedding_keys": null,
2595
+ "mean_std_embedding_keys": null,
2596
+ "min_max_embedding_keys": null,
2597
+ "pass_through_keys": null,
2598
+ "action_configs": [
2599
+ {
2600
+ "rep": "REL_XYZ_ROT6D",
2601
+ "type": "EEF",
2602
+ "format": "XYZ_ROT6D",
2603
+ "state_key": "psm1_pose",
2604
+ "input_rotation_format": "quat",
2605
+ "input_quat_order": "xyzw",
2606
+ "reference_rotation_format": "quat",
2607
+ "reference_quat_order": "xyzw",
2608
+ "translation_scaling_key": null,
2609
+ "rotation_scaling_key": null,
2610
+ "hold_through_clutch": false,
2611
+ "normalization_type": "percentile"
2612
+ },
2613
+ {
2614
+ "rep": "ABSOLUTE",
2615
+ "type": "NON_EEF",
2616
+ "format": "DEFAULT",
2617
+ "state_key": null,
2618
+ "input_rotation_format": "quat",
2619
+ "input_quat_order": "xyzw",
2620
+ "reference_rotation_format": "rot6d",
2621
+ "reference_quat_order": "xyzw",
2622
+ "translation_scaling_key": null,
2623
+ "rotation_scaling_key": null,
2624
+ "hold_through_clutch": false,
2625
+ "normalization_type": "percentile"
2626
+ },
2627
+ {
2628
+ "rep": "REL_XYZ_ROT6D",
2629
+ "type": "EEF",
2630
+ "format": "XYZ_ROT6D",
2631
+ "state_key": "psm2_pose",
2632
+ "input_rotation_format": "quat",
2633
+ "input_quat_order": "xyzw",
2634
+ "reference_rotation_format": "quat",
2635
+ "reference_quat_order": "xyzw",
2636
+ "translation_scaling_key": null,
2637
+ "rotation_scaling_key": null,
2638
+ "hold_through_clutch": false,
2639
+ "normalization_type": "percentile"
2640
+ },
2641
+ {
2642
+ "rep": "ABSOLUTE",
2643
+ "type": "NON_EEF",
2644
+ "format": "DEFAULT",
2645
+ "state_key": null,
2646
+ "input_rotation_format": "quat",
2647
+ "input_quat_order": "xyzw",
2648
+ "reference_rotation_format": "rot6d",
2649
+ "reference_quat_order": "xyzw",
2650
+ "translation_scaling_key": null,
2651
+ "rotation_scaling_key": null,
2652
+ "hold_through_clutch": false,
2653
+ "normalization_type": "percentile"
2654
+ }
2655
+ ]
2656
+ },
2657
+ "language": {
2658
+ "delta_indices": [
2659
+ 0
2660
+ ],
2661
+ "modality_keys": [
2662
+ "task"
2663
+ ],
2664
+ "sin_cos_embedding_keys": null,
2665
+ "mean_std_embedding_keys": null,
2666
+ "min_max_embedding_keys": null,
2667
+ "pass_through_keys": null,
2668
+ "action_configs": null
2669
+ }
2670
+ },
2671
+ "jhu_imerse_star_il": {
2672
+ "video": {
2673
+ "delta_indices": [
2674
+ 0
2675
+ ],
2676
+ "modality_keys": [
2677
+ "endoscope_left",
2678
+ "wrist_left"
2679
+ ],
2680
+ "sin_cos_embedding_keys": null,
2681
+ "mean_std_embedding_keys": null,
2682
+ "min_max_embedding_keys": null,
2683
+ "pass_through_keys": null,
2684
+ "action_configs": null
2685
+ },
2686
+ "state": {
2687
+ "delta_indices": [
2688
+ 0
2689
+ ],
2690
+ "modality_keys": [
2691
+ "kuka_joint_pos",
2692
+ "endo360_joint_pos",
2693
+ "kuka_pose"
2694
+ ],
2695
+ "sin_cos_embedding_keys": null,
2696
+ "mean_std_embedding_keys": [
2697
+ "kuka_joint_pos",
2698
+ "endo360_joint_pos"
2699
+ ],
2700
+ "min_max_embedding_keys": null,
2701
+ "pass_through_keys": [
2702
+ "kuka_pose"
2703
+ ],
2704
+ "action_configs": null
2705
+ },
2706
+ "action": {
2707
+ "delta_indices": [
2708
+ 1,
2709
+ 2,
2710
+ 3,
2711
+ 4,
2712
+ 5,
2713
+ 6,
2714
+ 7,
2715
+ 8,
2716
+ 9,
2717
+ 10,
2718
+ 11,
2719
+ 12,
2720
+ 13,
2721
+ 14,
2722
+ 15,
2723
+ 16,
2724
+ 17,
2725
+ 18,
2726
+ 19,
2727
+ 20,
2728
+ 21,
2729
+ 22,
2730
+ 23,
2731
+ 24,
2732
+ 25,
2733
+ 26,
2734
+ 27,
2735
+ 28,
2736
+ 29,
2737
+ 30,
2738
+ 31,
2739
+ 32,
2740
+ 33,
2741
+ 34,
2742
+ 35,
2743
+ 36,
2744
+ 37,
2745
+ 38,
2746
+ 39,
2747
+ 40,
2748
+ 41,
2749
+ 42,
2750
+ 43,
2751
+ 44,
2752
+ 45,
2753
+ 46,
2754
+ 47,
2755
+ 48,
2756
+ 49,
2757
+ 50
2758
+ ],
2759
+ "modality_keys": [
2760
+ "kuka_pose"
2761
+ ],
2762
+ "sin_cos_embedding_keys": null,
2763
+ "mean_std_embedding_keys": null,
2764
+ "min_max_embedding_keys": null,
2765
+ "pass_through_keys": null,
2766
+ "action_configs": [
2767
+ {
2768
+ "rep": "REL_XYZ_ROT6D",
2769
+ "type": "EEF",
2770
+ "format": "XYZ_ROT6D",
2771
+ "state_key": "kuka_pose",
2772
+ "input_rotation_format": "quat",
2773
+ "input_quat_order": "xyzw",
2774
+ "reference_rotation_format": "quat",
2775
+ "reference_quat_order": "xyzw",
2776
+ "translation_scaling_key": null,
2777
+ "rotation_scaling_key": null,
2778
+ "hold_through_clutch": false,
2779
+ "normalization_type": "percentile"
2780
+ }
2781
+ ]
2782
+ },
2783
+ "language": {
2784
+ "delta_indices": [
2785
+ 0
2786
+ ],
2787
+ "modality_keys": [
2788
+ "annotation.human.task_description"
2789
+ ],
2790
+ "sin_cos_embedding_keys": null,
2791
+ "mean_std_embedding_keys": null,
2792
+ "min_max_embedding_keys": null,
2793
+ "pass_through_keys": null,
2794
+ "action_configs": null
2795
+ }
2796
+ }
2797
+ },
2798
+ "image_crop_size": null,
2799
+ "image_target_size": null,
2800
+ "use_albumentations": true,
2801
+ "random_rotation_angle": null,
2802
+ "color_jitter_params": null,
2803
+ "shortest_image_edge": 256,
2804
+ "crop_fraction": 0.95,
2805
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
2806
+ "model_type": "eagle",
2807
+ "formalize_language": true,
2808
+ "max_state_dim": 128,
2809
+ "max_action_dim": 128,
2810
+ "max_action_horizon": 50,
2811
+ "use_percentiles": false,
2812
+ "clip_outliers": true,
2813
+ "apply_sincos_state_encoding": true,
2814
+ "use_relative_action": true
2815
+ }
2816
+ }
checkpoint-100000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3a90f89949eefdc070ca017ba5639ef3c048e0adc66a9ebabe8f98f3097ff56
3
+ size 14645
checkpoint-100000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0adcd884f504218477ba4e8d6c172d1621334843f166d925140146f37d9592b
3
+ size 1465
checkpoint-100000/statistics.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-100000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-100000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46d07ebf9c23abaad7dedb34e5f24bcf194b2e63cd9836ca05f6a3b638164d87
3
+ size 5841
checkpoint-100000/wandb_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"project": "finetune-gr00t-n1d6", "run_id": "groot_finetune_v2"}
checkpoint-80000/config.json ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_horizon": 50,
3
+ "add_pos_embed": true,
4
+ "apply_sincos_state_encoding": true,
5
+ "architectures": [
6
+ "Gr00tN1d6"
7
+ ],
8
+ "attn_dropout": 0.2,
9
+ "attn_implementation": null,
10
+ "backbone_embedding_dim": 2048,
11
+ "backbone_model_type": "eagle",
12
+ "backbone_trainable_params_fp32": true,
13
+ "collator_overwrite_image_inputs": false,
14
+ "color_jitter_params": {
15
+ "brightness": 0.1,
16
+ "contrast": 0.1,
17
+ "hue": 0.1,
18
+ "saturation": 0.1
19
+ },
20
+ "crop_fraction": 0.95,
21
+ "diffusion_model_cfg": {
22
+ "attention_head_dim": 48,
23
+ "dropout": 0.2,
24
+ "final_dropout": true,
25
+ "interleave_self_attention": true,
26
+ "norm_type": "ada_norm",
27
+ "num_attention_heads": 32,
28
+ "num_layers": 32,
29
+ "output_dim": 1024,
30
+ "positional_embeddings": null
31
+ },
32
+ "eagle_collator": true,
33
+ "formalize_language": true,
34
+ "gemma_collator": false,
35
+ "hidden_size": 1024,
36
+ "image_crop_size": null,
37
+ "image_target_size": null,
38
+ "input_embedding_dim": 1536,
39
+ "load_bf16": true,
40
+ "max_action_dim": 128,
41
+ "max_num_embodiments": 32,
42
+ "max_seq_len": 1024,
43
+ "max_state_dim": 128,
44
+ "model_dtype": "bfloat16",
45
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
46
+ "model_type": "Gr00tN1d6",
47
+ "noise_beta_alpha": 1.5,
48
+ "noise_beta_beta": 1.0,
49
+ "noise_s": 0.999,
50
+ "num_inference_timesteps": 4,
51
+ "num_timestep_buckets": 1000,
52
+ "random_rotation_angle": null,
53
+ "reproject_vision": false,
54
+ "select_layer": 16,
55
+ "shortest_image_edge": 256,
56
+ "soft_prompt_lr_scale": 1.0,
57
+ "soft_prompt_num_tokens": 32,
58
+ "state_dropout_prob": 0.0,
59
+ "state_dropout_prob_per_embodiment": {
60
+ "cmr_versius": 1.0,
61
+ "hamlyn_dvrk_30hz": 1.0,
62
+ "jhu_imerse_dvrk": 1.0,
63
+ "jhu_imerse_dvrk_mono": 1.0,
64
+ "jhu_imerse_star_il": 1.0,
65
+ "jhu_lscr_dvrk_smarts": 1.0,
66
+ "obuda_dvrk": 1.0,
67
+ "rob_surgical_bitrack": 1.0,
68
+ "stanford_dvrk_real": 1.0,
69
+ "tud_tundra_ur5e": 1.0,
70
+ "turin_mitic_ex_vivo": 1.0,
71
+ "ucb_dvrk": 1.0,
72
+ "ucsd_dvrk": 1.0,
73
+ "ustc_torin_tuodao": 1.0
74
+ },
75
+ "torch_dtype": "bfloat16",
76
+ "transformers_version": "4.51.3",
77
+ "tune_diffusion_model": true,
78
+ "tune_llm": false,
79
+ "tune_projector": true,
80
+ "tune_top_llm_layers": 4,
81
+ "tune_visual": false,
82
+ "tune_vlln": true,
83
+ "use_albumentations_transforms": true,
84
+ "use_alternate_vl_dit": true,
85
+ "use_flash_attention": true,
86
+ "use_relative_action": true,
87
+ "use_soft_prompts": false,
88
+ "use_vlln": true
89
+ }
checkpoint-80000/embodiment_id.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "robocasa_panda_omron": 13,
3
+ "gr1": 20,
4
+ "behavior_r1_pro": 24,
5
+ "unitree_g1": 8,
6
+ "oxe_google": 0,
7
+ "oxe_widowx": 1,
8
+ "libero_panda": 2,
9
+ "new_embodiment": 10,
10
+ "jhu_imerse_dvrk": 3,
11
+ "cmr_versius": 4,
12
+ "ucb_dvrk": 5,
13
+ "sanoscience_sim": 6,
14
+ "tum_sonata_franka": 7,
15
+ "hamlyn_dvrk_15hz": 9,
16
+ "hamlyn_dvrk_30hz": 11,
17
+ "ustc_torin_tuodao": 12,
18
+ "ucsd_dvrk": 14,
19
+ "jhu_imerse_dvrk_mono": 15,
20
+ "rob_surgical_bitrack": 16,
21
+ "stanford_dvrk_real": 17,
22
+ "obuda_dvrk": 18,
23
+ "polyu_sim": 19,
24
+ "moon_maestro": 21,
25
+ "jhu_lscr_dvrk_miracle": 22,
26
+ "jhu_lscr_dvrk_smarts": 23,
27
+ "jhu_imerse_star_il": 27,
28
+ "tud_tundra_ur5e": 25,
29
+ "turin_mitic_ex_vivo": 26,
30
+ "oxe_droid": 29
31
+ }
checkpoint-80000/experiment_cfg/conf.yaml ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ load_config_path: null
2
+ model:
3
+ model_type: Gr00tN1d6
4
+ model_dtype: bfloat16
5
+ model_name: nvidia/Eagle-Block2A-2B-v2
6
+ backbone_model_type: eagle
7
+ model_revision: null
8
+ tune_top_llm_layers: 4
9
+ backbone_embedding_dim: 2048
10
+ tune_llm: false
11
+ tune_visual: false
12
+ select_layer: 16
13
+ reproject_vision: false
14
+ use_flash_attention: true
15
+ load_bf16: false
16
+ collator_overwrite_image_inputs: false
17
+ eagle_collator: true
18
+ backbone_trainable_params_fp32: true
19
+ image_crop_size: null
20
+ image_target_size: null
21
+ shortest_image_edge: 256
22
+ crop_fraction: 0.95
23
+ random_rotation_angle: null
24
+ color_jitter_params: null
25
+ use_albumentations_transforms: true
26
+ extra_augmentation_config: null
27
+ formalize_language: true
28
+ apply_sincos_state_encoding: false
29
+ use_relative_action: true
30
+ max_state_dim: 29
31
+ max_action_dim: 29
32
+ action_horizon: 50
33
+ hidden_size: 1024
34
+ input_embedding_dim: 1536
35
+ add_pos_embed: true
36
+ attn_dropout: 0.2
37
+ use_vlln: true
38
+ max_seq_len: 1024
39
+ use_alternate_vl_dit: true
40
+ attend_text_every_n_blocks: 2
41
+ diffusion_model_cfg:
42
+ positional_embeddings: null
43
+ num_layers: 32
44
+ num_attention_heads: 32
45
+ attention_head_dim: 48
46
+ norm_type: ada_norm
47
+ dropout: 0.2
48
+ final_dropout: true
49
+ output_dim: 1024
50
+ interleave_self_attention: true
51
+ num_inference_timesteps: 4
52
+ noise_beta_alpha: 1.5
53
+ noise_beta_beta: 1.0
54
+ noise_s: 0.999
55
+ num_timestep_buckets: 1000
56
+ tune_projector: true
57
+ tune_diffusion_model: true
58
+ tune_vlln: true
59
+ state_dropout_prob: 0.0
60
+ state_dropout_prob_per_embodiment: null
61
+ state_additive_noise_scale: 0.0
62
+ max_num_embodiments: 32
63
+ data:
64
+ datasets:
65
+ - dataset_paths:
66
+ - /hkfs/work/workspace/scratch/uenyr-thesis/data/peg_transfer_lerobot
67
+ embodiment_tag: jhu_imerse_dvrk_mono
68
+ mix_ratio: 1.0
69
+ dataset_type: physical_embodiment
70
+ val_dataset_path: null
71
+ exclude_splits: null
72
+ include_splits: null
73
+ modality_configs:
74
+ jhu_imerse_dvrk_mono:
75
+ video:
76
+ delta_indices:
77
+ - 0
78
+ modality_keys:
79
+ - endoscope_left
80
+ sin_cos_embedding_keys: null
81
+ mean_std_embedding_keys: null
82
+ min_max_embedding_keys: null
83
+ pass_through_keys: null
84
+ action_configs: null
85
+ state:
86
+ delta_indices:
87
+ - 0
88
+ modality_keys:
89
+ - psm1_pose
90
+ - psm1_gripper
91
+ - psm2_pose
92
+ - psm2_gripper
93
+ sin_cos_embedding_keys: null
94
+ mean_std_embedding_keys:
95
+ - psm1_pose
96
+ - psm1_gripper
97
+ - psm2_pose
98
+ - psm2_gripper
99
+ min_max_embedding_keys: null
100
+ pass_through_keys: null
101
+ action_configs: null
102
+ action:
103
+ delta_indices:
104
+ - 0
105
+ - 1
106
+ - 2
107
+ - 3
108
+ - 4
109
+ - 5
110
+ - 6
111
+ - 7
112
+ - 8
113
+ - 9
114
+ - 10
115
+ - 11
116
+ - 12
117
+ - 13
118
+ - 14
119
+ - 15
120
+ - 16
121
+ - 17
122
+ - 18
123
+ - 19
124
+ - 20
125
+ - 21
126
+ - 22
127
+ - 23
128
+ - 24
129
+ - 25
130
+ - 26
131
+ - 27
132
+ - 28
133
+ - 29
134
+ - 30
135
+ - 31
136
+ - 32
137
+ - 33
138
+ - 34
139
+ - 35
140
+ - 36
141
+ - 37
142
+ - 38
143
+ - 39
144
+ - 40
145
+ - 41
146
+ - 42
147
+ - 43
148
+ - 44
149
+ - 45
150
+ - 46
151
+ - 47
152
+ - 48
153
+ - 49
154
+ modality_keys:
155
+ - psm1_pose
156
+ - psm1_gripper
157
+ - psm2_pose
158
+ - psm2_gripper
159
+ sin_cos_embedding_keys: null
160
+ mean_std_embedding_keys: null
161
+ min_max_embedding_keys: null
162
+ pass_through_keys: null
163
+ action_configs:
164
+ - rep: REL_XYZ_ROT6D
165
+ type: EEF
166
+ format: XYZ_ROT6D
167
+ state_key: psm1_pose
168
+ input_rotation_format: quat
169
+ input_quat_order: xyzw
170
+ reference_rotation_format: quat
171
+ reference_quat_order: xyzw
172
+ translation_scaling_key: null
173
+ rotation_scaling_key: null
174
+ hold_through_clutch: false
175
+ normalization_type: temporal_meanstd
176
+ - rep: ABSOLUTE
177
+ type: NON_EEF
178
+ format: DEFAULT
179
+ state_key: null
180
+ input_rotation_format: quat
181
+ input_quat_order: xyzw
182
+ reference_rotation_format: rot6d
183
+ reference_quat_order: xyzw
184
+ translation_scaling_key: null
185
+ rotation_scaling_key: null
186
+ hold_through_clutch: false
187
+ normalization_type: temporal_meanstd
188
+ - rep: REL_XYZ_ROT6D
189
+ type: EEF
190
+ format: XYZ_ROT6D
191
+ state_key: psm2_pose
192
+ input_rotation_format: quat
193
+ input_quat_order: xyzw
194
+ reference_rotation_format: quat
195
+ reference_quat_order: xyzw
196
+ translation_scaling_key: null
197
+ rotation_scaling_key: null
198
+ hold_through_clutch: false
199
+ normalization_type: temporal_meanstd
200
+ - rep: ABSOLUTE
201
+ type: NON_EEF
202
+ format: DEFAULT
203
+ state_key: null
204
+ input_rotation_format: quat
205
+ input_quat_order: xyzw
206
+ reference_rotation_format: rot6d
207
+ reference_quat_order: xyzw
208
+ translation_scaling_key: null
209
+ rotation_scaling_key: null
210
+ hold_through_clutch: false
211
+ normalization_type: temporal_meanstd
212
+ language:
213
+ delta_indices:
214
+ - 0
215
+ modality_keys:
216
+ - annotation.human.task_description
217
+ sin_cos_embedding_keys: null
218
+ mean_std_embedding_keys: null
219
+ min_max_embedding_keys: null
220
+ pass_through_keys: null
221
+ action_configs: null
222
+ download_cache: false
223
+ shard_size: 1024
224
+ episode_sampling_rate: 0.1
225
+ num_shards_per_epoch: 100000
226
+ override_pretraining_statistics: true
227
+ mode: single_turn
228
+ random_chop: 0.0
229
+ mock_dataset_mode: false
230
+ shuffle: true
231
+ seed: 42
232
+ multiprocessing_context: fork
233
+ allow_padding: false
234
+ subsample_ratio: 1.0
235
+ image_crop_size:
236
+ - 244
237
+ - 244
238
+ image_target_size:
239
+ - 224
240
+ - 224
241
+ video_backend: torchcodec
242
+ training:
243
+ output_dir: /hkfs/work/workspace/scratch/uenyr-thesis/outputs/groot_finetune_v2
244
+ experiment_name: null
245
+ max_steps: 100000
246
+ global_batch_size: 8
247
+ batch_size: null
248
+ gradient_accumulation_steps: 1
249
+ learning_rate: 0.0001
250
+ lr_scheduler_type: cosine
251
+ weight_decay: 1.0e-05
252
+ warmup_ratio: 0.05
253
+ warmup_steps: 0
254
+ max_grad_norm: 1.0
255
+ optim: adamw_torch
256
+ start_from_checkpoint: /hkfs/work/workspace/scratch/uenyr-thesis/pretrained/GR00T-H
257
+ tf32: true
258
+ fp16: false
259
+ bf16: true
260
+ eval_bf16: true
261
+ logging_steps: 10
262
+ save_steps: 10000
263
+ save_total_limit: 5
264
+ save_vl_model: false
265
+ upload_checkpoints: false
266
+ upload_every: 1000
267
+ upload_last_n_checkpoints: 5
268
+ max_concurrent_uploads: 2
269
+ eval_strategy: 'no'
270
+ eval_steps: 500
271
+ eval_set_split_ratio: 0.1
272
+ eval_batch_size: 2
273
+ save_best_eval_metric_name: ''
274
+ save_best_eval_metric_greater_is_better: true
275
+ deepspeed_stage: 2
276
+ gradient_checkpointing: false
277
+ transformers_trust_remote_code: true
278
+ transformers_local_files_only: false
279
+ transformers_cache_dir: null
280
+ transformers_access_token: null
281
+ use_ddp: false
282
+ ddp_bucket_cap_mb: 100
283
+ num_gpus: 1
284
+ dataloader_num_workers: 4
285
+ remove_unused_columns: false
286
+ use_wandb: false
287
+ wandb_project: finetune-gr00t-n1d6
288
+ enable_profiling: false
289
+ max_retries: 3
290
+ assert_loss_less_than: null
291
+ add_rl_callback: false
292
+ enable_open_loop_eval: false
293
+ open_loop_eval_traj_ids:
294
+ - 0
295
+ open_loop_eval_steps_per_traj: 100
296
+ open_loop_eval_plot_indices: null
297
+ max_steps: 100000
298
+ save_steps: 10000
checkpoint-80000/experiment_cfg/config.yaml ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !!python/object:gr00t.configs.base_config.Config
2
+ data: !!python/object:gr00t.configs.data.data_config.DataConfig
3
+ allow_padding: false
4
+ datasets:
5
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
6
+ dataset_paths:
7
+ - /hkfs/work/workspace/scratch/uenyr-thesis/data/peg_transfer_lerobot
8
+ dataset_type: physical_embodiment
9
+ embodiment_tag: jhu_imerse_dvrk_mono
10
+ exclude_splits: null
11
+ include_splits: null
12
+ mix_ratio: 1.0
13
+ val_dataset_path: null
14
+ download_cache: false
15
+ episode_sampling_rate: 0.1
16
+ image_crop_size:
17
+ - 244
18
+ - 244
19
+ image_target_size:
20
+ - 224
21
+ - 224
22
+ mock_dataset_mode: false
23
+ modality_configs:
24
+ jhu_imerse_dvrk_mono:
25
+ action: !!python/object:gr00t.data.types.ModalityConfig
26
+ action_configs:
27
+ - !!python/object:gr00t.data.types.ActionConfig
28
+ format: &id001 !!python/object/apply:gr00t.data.types.ActionFormat
29
+ - xyz+rot6d
30
+ hold_through_clutch: false
31
+ input_quat_order: xyzw
32
+ input_rotation_format: quat
33
+ normalization_type: temporal_meanstd
34
+ reference_quat_order: xyzw
35
+ reference_rotation_format: quat
36
+ rep: &id002 !!python/object/apply:gr00t.data.types.ActionRepresentation
37
+ - rel_xyz_rot6d
38
+ rotation_scaling_key: null
39
+ state_key: psm1_pose
40
+ translation_scaling_key: null
41
+ type: &id003 !!python/object/apply:gr00t.data.types.ActionType
42
+ - eef
43
+ - !!python/object:gr00t.data.types.ActionConfig
44
+ format: &id004 !!python/object/apply:gr00t.data.types.ActionFormat
45
+ - default
46
+ hold_through_clutch: false
47
+ input_quat_order: xyzw
48
+ input_rotation_format: quat
49
+ normalization_type: temporal_meanstd
50
+ reference_quat_order: xyzw
51
+ reference_rotation_format: rot6d
52
+ rep: &id005 !!python/object/apply:gr00t.data.types.ActionRepresentation
53
+ - absolute
54
+ rotation_scaling_key: null
55
+ state_key: null
56
+ translation_scaling_key: null
57
+ type: &id006 !!python/object/apply:gr00t.data.types.ActionType
58
+ - non_eef
59
+ - !!python/object:gr00t.data.types.ActionConfig
60
+ format: *id001
61
+ hold_through_clutch: false
62
+ input_quat_order: xyzw
63
+ input_rotation_format: quat
64
+ normalization_type: temporal_meanstd
65
+ reference_quat_order: xyzw
66
+ reference_rotation_format: quat
67
+ rep: *id002
68
+ rotation_scaling_key: null
69
+ state_key: psm2_pose
70
+ translation_scaling_key: null
71
+ type: *id003
72
+ - !!python/object:gr00t.data.types.ActionConfig
73
+ format: *id004
74
+ hold_through_clutch: false
75
+ input_quat_order: xyzw
76
+ input_rotation_format: quat
77
+ normalization_type: temporal_meanstd
78
+ reference_quat_order: xyzw
79
+ reference_rotation_format: rot6d
80
+ rep: *id005
81
+ rotation_scaling_key: null
82
+ state_key: null
83
+ translation_scaling_key: null
84
+ type: *id006
85
+ delta_indices:
86
+ - 0
87
+ - 1
88
+ - 2
89
+ - 3
90
+ - 4
91
+ - 5
92
+ - 6
93
+ - 7
94
+ - 8
95
+ - 9
96
+ - 10
97
+ - 11
98
+ - 12
99
+ - 13
100
+ - 14
101
+ - 15
102
+ - 16
103
+ - 17
104
+ - 18
105
+ - 19
106
+ - 20
107
+ - 21
108
+ - 22
109
+ - 23
110
+ - 24
111
+ - 25
112
+ - 26
113
+ - 27
114
+ - 28
115
+ - 29
116
+ - 30
117
+ - 31
118
+ - 32
119
+ - 33
120
+ - 34
121
+ - 35
122
+ - 36
123
+ - 37
124
+ - 38
125
+ - 39
126
+ - 40
127
+ - 41
128
+ - 42
129
+ - 43
130
+ - 44
131
+ - 45
132
+ - 46
133
+ - 47
134
+ - 48
135
+ - 49
136
+ mean_std_embedding_keys: null
137
+ min_max_embedding_keys: null
138
+ modality_keys:
139
+ - psm1_pose
140
+ - psm1_gripper
141
+ - psm2_pose
142
+ - psm2_gripper
143
+ pass_through_keys: null
144
+ sin_cos_embedding_keys: null
145
+ language: !!python/object:gr00t.data.types.ModalityConfig
146
+ action_configs: null
147
+ delta_indices:
148
+ - 0
149
+ mean_std_embedding_keys: null
150
+ min_max_embedding_keys: null
151
+ modality_keys:
152
+ - annotation.human.task_description
153
+ pass_through_keys: null
154
+ sin_cos_embedding_keys: null
155
+ state: !!python/object:gr00t.data.types.ModalityConfig
156
+ action_configs: null
157
+ delta_indices:
158
+ - 0
159
+ mean_std_embedding_keys:
160
+ - psm1_pose
161
+ - psm1_gripper
162
+ - psm2_pose
163
+ - psm2_gripper
164
+ min_max_embedding_keys: null
165
+ modality_keys:
166
+ - psm1_pose
167
+ - psm1_gripper
168
+ - psm2_pose
169
+ - psm2_gripper
170
+ pass_through_keys: null
171
+ sin_cos_embedding_keys: null
172
+ video: !!python/object:gr00t.data.types.ModalityConfig
173
+ action_configs: null
174
+ delta_indices:
175
+ - 0
176
+ mean_std_embedding_keys: null
177
+ min_max_embedding_keys: null
178
+ modality_keys:
179
+ - endoscope_left
180
+ pass_through_keys: null
181
+ sin_cos_embedding_keys: null
182
+ mode: single_turn
183
+ multiprocessing_context: fork
184
+ num_shards_per_epoch: 100000
185
+ override_pretraining_statistics: true
186
+ random_chop: 0.0
187
+ seed: 42
188
+ shard_size: 1024
189
+ shuffle: true
190
+ subsample_ratio: 1.0
191
+ video_backend: torchcodec
192
+ load_config_path: null
193
+ model: !!python/object:gr00t.configs.model.gr00t_n1d6.Gr00tN1d6Config
194
+ _attn_implementation_autoset: false
195
+ _attn_implementation_internal: null
196
+ _commit_hash: null
197
+ _name_or_path: ''
198
+ add_cross_attention: false
199
+ architectures: null
200
+ backbone_model_type: eagle
201
+ backbone_trainable_params_fp32: true
202
+ bad_words_ids: null
203
+ begin_suppress_tokens: null
204
+ bos_token_id: null
205
+ chunk_size_feed_forward: 0
206
+ color_jitter_params: null
207
+ cross_attention_hidden_size: null
208
+ decoder_start_token_id: null
209
+ diffusion_model_cfg:
210
+ attention_head_dim: 48
211
+ dropout: 0.2
212
+ final_dropout: true
213
+ interleave_self_attention: true
214
+ norm_type: ada_norm
215
+ num_attention_heads: 32
216
+ num_layers: 32
217
+ output_dim: 1024
218
+ positional_embeddings: null
219
+ diversity_penalty: 0.0
220
+ do_sample: false
221
+ eagle_collator: true
222
+ early_stopping: false
223
+ encoder_no_repeat_ngram_size: 0
224
+ eos_token_id: null
225
+ exponential_decay_length_penalty: null
226
+ extra_augmentation_config: null
227
+ finetuning_task: null
228
+ forced_bos_token_id: null
229
+ forced_eos_token_id: null
230
+ id2label:
231
+ 0: LABEL_0
232
+ 1: LABEL_1
233
+ is_decoder: false
234
+ is_encoder_decoder: false
235
+ label2id:
236
+ LABEL_0: 0
237
+ LABEL_1: 1
238
+ length_penalty: 1.0
239
+ load_bf16: false
240
+ max_length: 20
241
+ min_length: 0
242
+ model_name: nvidia/Eagle-Block2A-2B-v2
243
+ no_repeat_ngram_size: 0
244
+ num_beam_groups: 1
245
+ num_beams: 1
246
+ num_return_sequences: 1
247
+ output_attentions: false
248
+ output_hidden_states: false
249
+ output_scores: false
250
+ pad_token_id: null
251
+ prefix: null
252
+ problem_type: null
253
+ pruned_heads: {}
254
+ random_rotation_angle: null
255
+ remove_invalid_values: false
256
+ repetition_penalty: 1.0
257
+ reproject_vision: false
258
+ return_dict: true
259
+ return_dict_in_generate: false
260
+ sep_token_id: null
261
+ state_dropout_prob: 0.0
262
+ state_dropout_prob_per_embodiment: null
263
+ suppress_tokens: null
264
+ task_specific_params: null
265
+ temperature: 1.0
266
+ tf_legacy_loss: false
267
+ tie_encoder_decoder: false
268
+ tie_word_embeddings: true
269
+ tokenizer_class: null
270
+ top_k: 50
271
+ top_p: 1.0
272
+ torch_dtype: null
273
+ torchscript: false
274
+ transformers_version: null
275
+ tune_diffusion_model: true
276
+ tune_llm: false
277
+ tune_projector: true
278
+ tune_visual: false
279
+ typical_p: 1.0
280
+ use_bfloat16: false
281
+ use_relative_action: true
282
+ training: !!python/object:gr00t.configs.training.training_config.TrainingConfig
283
+ add_rl_callback: false
284
+ assert_loss_less_than: null
285
+ batch_size: null
286
+ bf16: true
287
+ dataloader_num_workers: 4
288
+ ddp_bucket_cap_mb: 100
289
+ deepspeed_stage: 2
290
+ enable_open_loop_eval: false
291
+ enable_profiling: false
292
+ eval_batch_size: 2
293
+ eval_bf16: true
294
+ eval_set_split_ratio: 0.1
295
+ eval_steps: 500
296
+ eval_strategy: 'no'
297
+ experiment_name: null
298
+ fp16: false
299
+ global_batch_size: 8
300
+ gradient_accumulation_steps: 1
301
+ gradient_checkpointing: false
302
+ learning_rate: 0.0001
303
+ logging_steps: 10
304
+ lr_scheduler_type: cosine
305
+ max_concurrent_uploads: 2
306
+ max_grad_norm: 1.0
307
+ max_retries: 3
308
+ max_steps: 100000
309
+ num_gpus: 1
310
+ open_loop_eval_plot_indices: null
311
+ open_loop_eval_steps_per_traj: 100
312
+ open_loop_eval_traj_ids:
313
+ - 0
314
+ optim: adamw_torch
315
+ output_dir: /hkfs/work/workspace/scratch/uenyr-thesis/outputs/groot_finetune_v2
316
+ remove_unused_columns: false
317
+ save_best_eval_metric_greater_is_better: true
318
+ save_best_eval_metric_name: ''
319
+ save_steps: 10000
320
+ save_total_limit: 5
321
+ save_vl_model: false
322
+ start_from_checkpoint: /hkfs/work/workspace/scratch/uenyr-thesis/pretrained/GR00T-H
323
+ tf32: true
324
+ transformers_access_token: null
325
+ transformers_cache_dir: null
326
+ transformers_local_files_only: false
327
+ transformers_trust_remote_code: true
328
+ upload_checkpoints: false
329
+ upload_every: 1000
330
+ upload_last_n_checkpoints: 5
331
+ use_ddp: false
332
+ use_wandb: false
333
+ wandb_project: finetune-gr00t-n1d6
334
+ warmup_ratio: 0.05
335
+ warmup_steps: 0
336
+ weight_decay: 1.0e-05
checkpoint-80000/experiment_cfg/dataset_statistics.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-80000/experiment_cfg/final_model_config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "Gr00tN1d6",
3
+ "model_dtype": "bfloat16",
4
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
5
+ "backbone_model_type": "eagle",
6
+ "model_revision": null,
7
+ "tune_top_llm_layers": 4,
8
+ "backbone_embedding_dim": 2048,
9
+ "tune_llm": false,
10
+ "tune_visual": false,
11
+ "select_layer": 16,
12
+ "reproject_vision": false,
13
+ "use_flash_attention": true,
14
+ "load_bf16": true,
15
+ "collator_overwrite_image_inputs": false,
16
+ "eagle_collator": true,
17
+ "backbone_trainable_params_fp32": true,
18
+ "extra_augmentation_config": null,
19
+ "apply_sincos_state_encoding": true,
20
+ "use_relative_action": true,
21
+ "max_state_dim": 128,
22
+ "max_action_dim": 128,
23
+ "action_horizon": 50,
24
+ "hidden_size": 1024,
25
+ "input_embedding_dim": 1536,
26
+ "add_pos_embed": true,
27
+ "attn_dropout": 0.2,
28
+ "use_vlln": true,
29
+ "max_seq_len": 1024,
30
+ "use_alternate_vl_dit": true,
31
+ "attend_text_every_n_blocks": 2,
32
+ "diffusion_model_cfg": {
33
+ "attention_head_dim": 48,
34
+ "dropout": 0.2,
35
+ "final_dropout": true,
36
+ "interleave_self_attention": true,
37
+ "norm_type": "ada_norm",
38
+ "num_attention_heads": 32,
39
+ "num_layers": 32,
40
+ "output_dim": 1024,
41
+ "positional_embeddings": null
42
+ },
43
+ "num_inference_timesteps": 4,
44
+ "noise_beta_alpha": 1.5,
45
+ "noise_beta_beta": 1.0,
46
+ "noise_s": 0.999,
47
+ "num_timestep_buckets": 1000,
48
+ "tune_projector": true,
49
+ "tune_diffusion_model": true,
50
+ "tune_vlln": true,
51
+ "state_dropout_prob": 0.0,
52
+ "state_dropout_prob_per_embodiment": {
53
+ "cmr_versius": 1.0,
54
+ "jhu_imerse_dvrk": 1.0,
55
+ "obuda_dvrk": 1.0,
56
+ "stanford_dvrk_real": 1.0,
57
+ "ucb_dvrk": 1.0,
58
+ "ucsd_dvrk": 1.0,
59
+ "hamlyn_dvrk_30hz": 1.0,
60
+ "jhu_imerse_dvrk_mono": 1.0,
61
+ "jhu_imerse_star_il": 1.0,
62
+ "jhu_lscr_dvrk_smarts": 1.0,
63
+ "rob_surgical_bitrack": 1.0,
64
+ "tud_tundra_ur5e": 1.0,
65
+ "turin_mitic_ex_vivo": 1.0,
66
+ "ustc_torin_tuodao": 1.0
67
+ },
68
+ "state_additive_noise_scale": 0.0,
69
+ "max_num_embodiments": 32
70
+ }
checkpoint-80000/experiment_cfg/final_processor_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-80000/model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c19de28e10df88541f5f55ac79279ee40927cbbe764658d0da525bedad82163
3
+ size 4990126640
checkpoint-80000/model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28b9a9743718a8f48ddc119e8c4f3cecab8e02a0752379975bbe1c134a50920d
3
+ size 4823190320
checkpoint-80000/model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-80000/processor_config.json ADDED
@@ -0,0 +1,2816 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "Gr00tN1d6Processor",
3
+ "processor_kwargs": {
4
+ "modality_configs": {
5
+ "behavior_r1_pro": {
6
+ "video": {
7
+ "delta_indices": [
8
+ 0
9
+ ],
10
+ "modality_keys": [
11
+ "observation.images.rgb.head_256_256",
12
+ "observation.images.rgb.left_wrist_256_256",
13
+ "observation.images.rgb.right_wrist_256_256"
14
+ ],
15
+ "sin_cos_embedding_keys": null,
16
+ "mean_std_embedding_keys": null,
17
+ "min_max_embedding_keys": null,
18
+ "pass_through_keys": null,
19
+ "action_configs": null
20
+ },
21
+ "state": {
22
+ "delta_indices": [
23
+ 0
24
+ ],
25
+ "modality_keys": [
26
+ "robot_pos",
27
+ "robot_ori_cos",
28
+ "robot_ori_sin",
29
+ "robot_2d_ori",
30
+ "robot_2d_ori_cos",
31
+ "robot_2d_ori_sin",
32
+ "robot_lin_vel",
33
+ "robot_ang_vel",
34
+ "arm_left_qpos",
35
+ "arm_left_qpos_sin",
36
+ "arm_left_qpos_cos",
37
+ "eef_left_pos",
38
+ "eef_left_quat",
39
+ "gripper_left_qpos",
40
+ "arm_right_qpos",
41
+ "arm_right_qpos_sin",
42
+ "arm_right_qpos_cos",
43
+ "eef_right_pos",
44
+ "eef_right_quat",
45
+ "gripper_right_qpos",
46
+ "trunk_qpos"
47
+ ],
48
+ "sin_cos_embedding_keys": null,
49
+ "mean_std_embedding_keys": null,
50
+ "min_max_embedding_keys": null,
51
+ "pass_through_keys": null,
52
+ "action_configs": null
53
+ },
54
+ "action": {
55
+ "delta_indices": [
56
+ 0,
57
+ 1,
58
+ 2,
59
+ 3,
60
+ 4,
61
+ 5,
62
+ 6,
63
+ 7,
64
+ 8,
65
+ 9,
66
+ 10,
67
+ 11,
68
+ 12,
69
+ 13,
70
+ 14,
71
+ 15,
72
+ 16,
73
+ 17,
74
+ 18,
75
+ 19,
76
+ 20,
77
+ 21,
78
+ 22,
79
+ 23,
80
+ 24,
81
+ 25,
82
+ 26,
83
+ 27,
84
+ 28,
85
+ 29,
86
+ 30,
87
+ 31
88
+ ],
89
+ "modality_keys": [
90
+ "base",
91
+ "torso",
92
+ "left_arm",
93
+ "left_gripper",
94
+ "right_arm",
95
+ "right_gripper"
96
+ ],
97
+ "sin_cos_embedding_keys": null,
98
+ "mean_std_embedding_keys": null,
99
+ "min_max_embedding_keys": null,
100
+ "pass_through_keys": null,
101
+ "action_configs": [
102
+ {
103
+ "rep": "ABSOLUTE",
104
+ "type": "NON_EEF",
105
+ "format": "DEFAULT",
106
+ "state_key": null,
107
+ "input_rotation_format": "quat",
108
+ "input_quat_order": "xyzw",
109
+ "reference_rotation_format": "rot6d",
110
+ "reference_quat_order": "xyzw",
111
+ "translation_scaling_key": null,
112
+ "rotation_scaling_key": null,
113
+ "hold_through_clutch": false,
114
+ "normalization_type": "percentile"
115
+ },
116
+ {
117
+ "rep": "RELATIVE",
118
+ "type": "NON_EEF",
119
+ "format": "DEFAULT",
120
+ "state_key": "trunk_qpos",
121
+ "input_rotation_format": "quat",
122
+ "input_quat_order": "xyzw",
123
+ "reference_rotation_format": "rot6d",
124
+ "reference_quat_order": "xyzw",
125
+ "translation_scaling_key": null,
126
+ "rotation_scaling_key": null,
127
+ "hold_through_clutch": false,
128
+ "normalization_type": "percentile"
129
+ },
130
+ {
131
+ "rep": "RELATIVE",
132
+ "type": "NON_EEF",
133
+ "format": "DEFAULT",
134
+ "state_key": "arm_left_qpos",
135
+ "input_rotation_format": "quat",
136
+ "input_quat_order": "xyzw",
137
+ "reference_rotation_format": "rot6d",
138
+ "reference_quat_order": "xyzw",
139
+ "translation_scaling_key": null,
140
+ "rotation_scaling_key": null,
141
+ "hold_through_clutch": false,
142
+ "normalization_type": "percentile"
143
+ },
144
+ {
145
+ "rep": "ABSOLUTE",
146
+ "type": "NON_EEF",
147
+ "format": "DEFAULT",
148
+ "state_key": null,
149
+ "input_rotation_format": "quat",
150
+ "input_quat_order": "xyzw",
151
+ "reference_rotation_format": "rot6d",
152
+ "reference_quat_order": "xyzw",
153
+ "translation_scaling_key": null,
154
+ "rotation_scaling_key": null,
155
+ "hold_through_clutch": false,
156
+ "normalization_type": "percentile"
157
+ },
158
+ {
159
+ "rep": "RELATIVE",
160
+ "type": "NON_EEF",
161
+ "format": "DEFAULT",
162
+ "state_key": "arm_right_qpos",
163
+ "input_rotation_format": "quat",
164
+ "input_quat_order": "xyzw",
165
+ "reference_rotation_format": "rot6d",
166
+ "reference_quat_order": "xyzw",
167
+ "translation_scaling_key": null,
168
+ "rotation_scaling_key": null,
169
+ "hold_through_clutch": false,
170
+ "normalization_type": "percentile"
171
+ },
172
+ {
173
+ "rep": "ABSOLUTE",
174
+ "type": "NON_EEF",
175
+ "format": "DEFAULT",
176
+ "state_key": null,
177
+ "input_rotation_format": "quat",
178
+ "input_quat_order": "xyzw",
179
+ "reference_rotation_format": "rot6d",
180
+ "reference_quat_order": "xyzw",
181
+ "translation_scaling_key": null,
182
+ "rotation_scaling_key": null,
183
+ "hold_through_clutch": false,
184
+ "normalization_type": "percentile"
185
+ }
186
+ ]
187
+ },
188
+ "language": {
189
+ "delta_indices": [
190
+ 0
191
+ ],
192
+ "modality_keys": [
193
+ "annotation.human.coarse_action"
194
+ ],
195
+ "sin_cos_embedding_keys": null,
196
+ "mean_std_embedding_keys": null,
197
+ "min_max_embedding_keys": null,
198
+ "pass_through_keys": null,
199
+ "action_configs": null
200
+ }
201
+ },
202
+ "gr1": {
203
+ "video": {
204
+ "delta_indices": [
205
+ 0
206
+ ],
207
+ "modality_keys": [
208
+ "ego_view_bg_crop_pad_res256_freq20"
209
+ ],
210
+ "sin_cos_embedding_keys": null,
211
+ "mean_std_embedding_keys": null,
212
+ "min_max_embedding_keys": null,
213
+ "pass_through_keys": null,
214
+ "action_configs": null
215
+ },
216
+ "state": {
217
+ "delta_indices": [
218
+ 0
219
+ ],
220
+ "modality_keys": [
221
+ "left_arm",
222
+ "right_arm",
223
+ "left_hand",
224
+ "right_hand",
225
+ "waist"
226
+ ],
227
+ "sin_cos_embedding_keys": [
228
+ "left_arm",
229
+ "right_arm",
230
+ "left_hand",
231
+ "right_hand",
232
+ "waist"
233
+ ],
234
+ "mean_std_embedding_keys": null,
235
+ "min_max_embedding_keys": null,
236
+ "pass_through_keys": null,
237
+ "action_configs": null
238
+ },
239
+ "action": {
240
+ "delta_indices": [
241
+ 0,
242
+ 1,
243
+ 2,
244
+ 3,
245
+ 4,
246
+ 5,
247
+ 6,
248
+ 7,
249
+ 8,
250
+ 9,
251
+ 10,
252
+ 11,
253
+ 12,
254
+ 13,
255
+ 14,
256
+ 15
257
+ ],
258
+ "modality_keys": [
259
+ "left_arm",
260
+ "right_arm",
261
+ "left_hand",
262
+ "right_hand",
263
+ "waist"
264
+ ],
265
+ "sin_cos_embedding_keys": null,
266
+ "mean_std_embedding_keys": null,
267
+ "min_max_embedding_keys": null,
268
+ "pass_through_keys": null,
269
+ "action_configs": [
270
+ {
271
+ "rep": "RELATIVE",
272
+ "type": "NON_EEF",
273
+ "format": "DEFAULT",
274
+ "state_key": null,
275
+ "input_rotation_format": "quat",
276
+ "input_quat_order": "xyzw",
277
+ "reference_rotation_format": "rot6d",
278
+ "reference_quat_order": "xyzw",
279
+ "translation_scaling_key": null,
280
+ "rotation_scaling_key": null,
281
+ "hold_through_clutch": false,
282
+ "normalization_type": "percentile"
283
+ },
284
+ {
285
+ "rep": "RELATIVE",
286
+ "type": "NON_EEF",
287
+ "format": "DEFAULT",
288
+ "state_key": null,
289
+ "input_rotation_format": "quat",
290
+ "input_quat_order": "xyzw",
291
+ "reference_rotation_format": "rot6d",
292
+ "reference_quat_order": "xyzw",
293
+ "translation_scaling_key": null,
294
+ "rotation_scaling_key": null,
295
+ "hold_through_clutch": false,
296
+ "normalization_type": "percentile"
297
+ },
298
+ {
299
+ "rep": "RELATIVE",
300
+ "type": "NON_EEF",
301
+ "format": "DEFAULT",
302
+ "state_key": null,
303
+ "input_rotation_format": "quat",
304
+ "input_quat_order": "xyzw",
305
+ "reference_rotation_format": "rot6d",
306
+ "reference_quat_order": "xyzw",
307
+ "translation_scaling_key": null,
308
+ "rotation_scaling_key": null,
309
+ "hold_through_clutch": false,
310
+ "normalization_type": "percentile"
311
+ },
312
+ {
313
+ "rep": "RELATIVE",
314
+ "type": "NON_EEF",
315
+ "format": "DEFAULT",
316
+ "state_key": null,
317
+ "input_rotation_format": "quat",
318
+ "input_quat_order": "xyzw",
319
+ "reference_rotation_format": "rot6d",
320
+ "reference_quat_order": "xyzw",
321
+ "translation_scaling_key": null,
322
+ "rotation_scaling_key": null,
323
+ "hold_through_clutch": false,
324
+ "normalization_type": "percentile"
325
+ },
326
+ {
327
+ "rep": "ABSOLUTE",
328
+ "type": "NON_EEF",
329
+ "format": "DEFAULT",
330
+ "state_key": null,
331
+ "input_rotation_format": "quat",
332
+ "input_quat_order": "xyzw",
333
+ "reference_rotation_format": "rot6d",
334
+ "reference_quat_order": "xyzw",
335
+ "translation_scaling_key": null,
336
+ "rotation_scaling_key": null,
337
+ "hold_through_clutch": false,
338
+ "normalization_type": "percentile"
339
+ }
340
+ ]
341
+ },
342
+ "language": {
343
+ "delta_indices": [
344
+ 0
345
+ ],
346
+ "modality_keys": [
347
+ "task"
348
+ ],
349
+ "sin_cos_embedding_keys": null,
350
+ "mean_std_embedding_keys": null,
351
+ "min_max_embedding_keys": null,
352
+ "pass_through_keys": null,
353
+ "action_configs": null
354
+ }
355
+ },
356
+ "robocasa_panda_omron": {
357
+ "video": {
358
+ "delta_indices": [
359
+ 0
360
+ ],
361
+ "modality_keys": [
362
+ "res256_image_side_0",
363
+ "res256_image_side_1",
364
+ "res256_image_wrist_0"
365
+ ],
366
+ "sin_cos_embedding_keys": null,
367
+ "mean_std_embedding_keys": null,
368
+ "min_max_embedding_keys": null,
369
+ "pass_through_keys": null,
370
+ "action_configs": null
371
+ },
372
+ "state": {
373
+ "delta_indices": [
374
+ 0
375
+ ],
376
+ "modality_keys": [
377
+ "end_effector_position_relative",
378
+ "end_effector_rotation_relative",
379
+ "gripper_qpos",
380
+ "base_position",
381
+ "base_rotation"
382
+ ],
383
+ "sin_cos_embedding_keys": null,
384
+ "mean_std_embedding_keys": null,
385
+ "min_max_embedding_keys": null,
386
+ "pass_through_keys": null,
387
+ "action_configs": null
388
+ },
389
+ "action": {
390
+ "delta_indices": [
391
+ 0,
392
+ 1,
393
+ 2,
394
+ 3,
395
+ 4,
396
+ 5,
397
+ 6,
398
+ 7,
399
+ 8,
400
+ 9,
401
+ 10,
402
+ 11,
403
+ 12,
404
+ 13,
405
+ 14,
406
+ 15
407
+ ],
408
+ "modality_keys": [
409
+ "end_effector_position",
410
+ "end_effector_rotation",
411
+ "gripper_close",
412
+ "base_motion",
413
+ "control_mode"
414
+ ],
415
+ "sin_cos_embedding_keys": null,
416
+ "mean_std_embedding_keys": null,
417
+ "min_max_embedding_keys": null,
418
+ "pass_through_keys": null,
419
+ "action_configs": [
420
+ {
421
+ "rep": "ABSOLUTE",
422
+ "type": "NON_EEF",
423
+ "format": "DEFAULT",
424
+ "state_key": null,
425
+ "input_rotation_format": "quat",
426
+ "input_quat_order": "xyzw",
427
+ "reference_rotation_format": "rot6d",
428
+ "reference_quat_order": "xyzw",
429
+ "translation_scaling_key": null,
430
+ "rotation_scaling_key": null,
431
+ "hold_through_clutch": false,
432
+ "normalization_type": "percentile"
433
+ },
434
+ {
435
+ "rep": "ABSOLUTE",
436
+ "type": "NON_EEF",
437
+ "format": "DEFAULT",
438
+ "state_key": null,
439
+ "input_rotation_format": "quat",
440
+ "input_quat_order": "xyzw",
441
+ "reference_rotation_format": "rot6d",
442
+ "reference_quat_order": "xyzw",
443
+ "translation_scaling_key": null,
444
+ "rotation_scaling_key": null,
445
+ "hold_through_clutch": false,
446
+ "normalization_type": "percentile"
447
+ },
448
+ {
449
+ "rep": "ABSOLUTE",
450
+ "type": "NON_EEF",
451
+ "format": "DEFAULT",
452
+ "state_key": null,
453
+ "input_rotation_format": "quat",
454
+ "input_quat_order": "xyzw",
455
+ "reference_rotation_format": "rot6d",
456
+ "reference_quat_order": "xyzw",
457
+ "translation_scaling_key": null,
458
+ "rotation_scaling_key": null,
459
+ "hold_through_clutch": false,
460
+ "normalization_type": "percentile"
461
+ },
462
+ {
463
+ "rep": "ABSOLUTE",
464
+ "type": "NON_EEF",
465
+ "format": "DEFAULT",
466
+ "state_key": null,
467
+ "input_rotation_format": "quat",
468
+ "input_quat_order": "xyzw",
469
+ "reference_rotation_format": "rot6d",
470
+ "reference_quat_order": "xyzw",
471
+ "translation_scaling_key": null,
472
+ "rotation_scaling_key": null,
473
+ "hold_through_clutch": false,
474
+ "normalization_type": "percentile"
475
+ },
476
+ {
477
+ "rep": "ABSOLUTE",
478
+ "type": "NON_EEF",
479
+ "format": "DEFAULT",
480
+ "state_key": null,
481
+ "input_rotation_format": "quat",
482
+ "input_quat_order": "xyzw",
483
+ "reference_rotation_format": "rot6d",
484
+ "reference_quat_order": "xyzw",
485
+ "translation_scaling_key": null,
486
+ "rotation_scaling_key": null,
487
+ "hold_through_clutch": false,
488
+ "normalization_type": "percentile"
489
+ }
490
+ ]
491
+ },
492
+ "language": {
493
+ "delta_indices": [
494
+ 0
495
+ ],
496
+ "modality_keys": [
497
+ "annotation.human.action.task_description"
498
+ ],
499
+ "sin_cos_embedding_keys": null,
500
+ "mean_std_embedding_keys": null,
501
+ "min_max_embedding_keys": null,
502
+ "pass_through_keys": null,
503
+ "action_configs": null
504
+ }
505
+ },
506
+ "cmr_versius": {
507
+ "video": {
508
+ "delta_indices": [
509
+ 0
510
+ ],
511
+ "modality_keys": [
512
+ "endoscope"
513
+ ],
514
+ "sin_cos_embedding_keys": null,
515
+ "mean_std_embedding_keys": null,
516
+ "min_max_embedding_keys": null,
517
+ "pass_through_keys": null,
518
+ "action_configs": null
519
+ },
520
+ "state": {
521
+ "delta_indices": [
522
+ 0
523
+ ],
524
+ "modality_keys": [
525
+ "left_pose",
526
+ "left_gripper",
527
+ "right_pose",
528
+ "right_gripper",
529
+ "translation_scaling",
530
+ "rotation_scaling",
531
+ "hapticengaged_left",
532
+ "hapticengaged_right"
533
+ ],
534
+ "sin_cos_embedding_keys": null,
535
+ "mean_std_embedding_keys": [
536
+ "left_pose",
537
+ "left_gripper",
538
+ "right_pose",
539
+ "right_gripper"
540
+ ],
541
+ "min_max_embedding_keys": null,
542
+ "pass_through_keys": [
543
+ "translation_scaling",
544
+ "rotation_scaling",
545
+ "hapticengaged_left",
546
+ "hapticengaged_right"
547
+ ],
548
+ "action_configs": null
549
+ },
550
+ "action": {
551
+ "delta_indices": [
552
+ 2,
553
+ 4,
554
+ 6,
555
+ 8,
556
+ 10,
557
+ 12,
558
+ 14,
559
+ 16,
560
+ 18,
561
+ 20,
562
+ 22,
563
+ 24,
564
+ 26,
565
+ 28,
566
+ 30,
567
+ 32,
568
+ 34,
569
+ 36,
570
+ 38,
571
+ 40,
572
+ 42,
573
+ 44,
574
+ 46,
575
+ 48,
576
+ 50,
577
+ 52,
578
+ 54,
579
+ 56,
580
+ 58,
581
+ 60,
582
+ 62,
583
+ 64,
584
+ 66,
585
+ 68,
586
+ 70,
587
+ 72,
588
+ 74,
589
+ 76,
590
+ 78,
591
+ 80,
592
+ 82,
593
+ 84,
594
+ 86,
595
+ 88,
596
+ 90,
597
+ 92,
598
+ 94,
599
+ 96,
600
+ 98,
601
+ 100
602
+ ],
603
+ "modality_keys": [
604
+ "left_pose",
605
+ "left_gripper",
606
+ "right_pose",
607
+ "right_gripper",
608
+ "hapticengaged_left",
609
+ "hapticengaged_right"
610
+ ],
611
+ "sin_cos_embedding_keys": null,
612
+ "mean_std_embedding_keys": null,
613
+ "min_max_embedding_keys": null,
614
+ "pass_through_keys": [
615
+ "hapticengaged_left",
616
+ "hapticengaged_right"
617
+ ],
618
+ "action_configs": [
619
+ {
620
+ "rep": "REL_XYZ_ROT6D",
621
+ "type": "EEF",
622
+ "format": "XYZ_ROT6D",
623
+ "state_key": "left_pose",
624
+ "input_rotation_format": "quat",
625
+ "input_quat_order": "xyzw",
626
+ "reference_rotation_format": "quat",
627
+ "reference_quat_order": "xyzw",
628
+ "translation_scaling_key": "translation_scaling",
629
+ "rotation_scaling_key": "rotation_scaling",
630
+ "hold_through_clutch": false,
631
+ "normalization_type": "percentile"
632
+ },
633
+ {
634
+ "rep": "ABSOLUTE",
635
+ "type": "NON_EEF",
636
+ "format": "DEFAULT",
637
+ "state_key": "left_gripper",
638
+ "input_rotation_format": "quat",
639
+ "input_quat_order": "xyzw",
640
+ "reference_rotation_format": "rot6d",
641
+ "reference_quat_order": "xyzw",
642
+ "translation_scaling_key": null,
643
+ "rotation_scaling_key": null,
644
+ "hold_through_clutch": true,
645
+ "normalization_type": "percentile"
646
+ },
647
+ {
648
+ "rep": "REL_XYZ_ROT6D",
649
+ "type": "EEF",
650
+ "format": "XYZ_ROT6D",
651
+ "state_key": "right_pose",
652
+ "input_rotation_format": "quat",
653
+ "input_quat_order": "xyzw",
654
+ "reference_rotation_format": "quat",
655
+ "reference_quat_order": "xyzw",
656
+ "translation_scaling_key": "translation_scaling",
657
+ "rotation_scaling_key": "rotation_scaling",
658
+ "hold_through_clutch": false,
659
+ "normalization_type": "percentile"
660
+ },
661
+ {
662
+ "rep": "ABSOLUTE",
663
+ "type": "NON_EEF",
664
+ "format": "DEFAULT",
665
+ "state_key": "right_gripper",
666
+ "input_rotation_format": "quat",
667
+ "input_quat_order": "xyzw",
668
+ "reference_rotation_format": "rot6d",
669
+ "reference_quat_order": "xyzw",
670
+ "translation_scaling_key": null,
671
+ "rotation_scaling_key": null,
672
+ "hold_through_clutch": true,
673
+ "normalization_type": "percentile"
674
+ },
675
+ {
676
+ "rep": "ABSOLUTE",
677
+ "type": "NON_EEF",
678
+ "format": "DEFAULT",
679
+ "state_key": null,
680
+ "input_rotation_format": "quat",
681
+ "input_quat_order": "xyzw",
682
+ "reference_rotation_format": "rot6d",
683
+ "reference_quat_order": "xyzw",
684
+ "translation_scaling_key": null,
685
+ "rotation_scaling_key": null,
686
+ "hold_through_clutch": false,
687
+ "normalization_type": "skip"
688
+ },
689
+ {
690
+ "rep": "ABSOLUTE",
691
+ "type": "NON_EEF",
692
+ "format": "DEFAULT",
693
+ "state_key": null,
694
+ "input_rotation_format": "quat",
695
+ "input_quat_order": "xyzw",
696
+ "reference_rotation_format": "rot6d",
697
+ "reference_quat_order": "xyzw",
698
+ "translation_scaling_key": null,
699
+ "rotation_scaling_key": null,
700
+ "hold_through_clutch": false,
701
+ "normalization_type": "skip"
702
+ }
703
+ ]
704
+ },
705
+ "language": {
706
+ "delta_indices": [
707
+ 0
708
+ ],
709
+ "modality_keys": [
710
+ "annotation.human.task_description"
711
+ ],
712
+ "sin_cos_embedding_keys": null,
713
+ "mean_std_embedding_keys": null,
714
+ "min_max_embedding_keys": null,
715
+ "pass_through_keys": null,
716
+ "action_configs": null
717
+ }
718
+ },
719
+ "ucsd_dvrk": {
720
+ "video": {
721
+ "delta_indices": [
722
+ 0
723
+ ],
724
+ "modality_keys": [
725
+ "camera_left"
726
+ ],
727
+ "sin_cos_embedding_keys": null,
728
+ "mean_std_embedding_keys": null,
729
+ "min_max_embedding_keys": null,
730
+ "pass_through_keys": null,
731
+ "action_configs": null
732
+ },
733
+ "state": {
734
+ "delta_indices": [
735
+ 0
736
+ ],
737
+ "modality_keys": [
738
+ "psm_retraction_pose",
739
+ "psm_retraction_gripper",
740
+ "psm_cutter_pose",
741
+ "psm_cutter_gripper"
742
+ ],
743
+ "sin_cos_embedding_keys": null,
744
+ "mean_std_embedding_keys": [
745
+ "psm_retraction_pose",
746
+ "psm_retraction_gripper",
747
+ "psm_cutter_pose",
748
+ "psm_cutter_gripper"
749
+ ],
750
+ "min_max_embedding_keys": null,
751
+ "pass_through_keys": null,
752
+ "action_configs": null
753
+ },
754
+ "action": {
755
+ "delta_indices": [
756
+ 1,
757
+ 2,
758
+ 3,
759
+ 4,
760
+ 5,
761
+ 6,
762
+ 7,
763
+ 8,
764
+ 9,
765
+ 10,
766
+ 11,
767
+ 12,
768
+ 13,
769
+ 14,
770
+ 15,
771
+ 16,
772
+ 17,
773
+ 18,
774
+ 19,
775
+ 20,
776
+ 21,
777
+ 22,
778
+ 23,
779
+ 24,
780
+ 25,
781
+ 26,
782
+ 27,
783
+ 28,
784
+ 29,
785
+ 30,
786
+ 31,
787
+ 32,
788
+ 33,
789
+ 34,
790
+ 35,
791
+ 36,
792
+ 37,
793
+ 38,
794
+ 39,
795
+ 40,
796
+ 41,
797
+ 42,
798
+ 43,
799
+ 44,
800
+ 45,
801
+ 46,
802
+ 47,
803
+ 48,
804
+ 49,
805
+ 50
806
+ ],
807
+ "modality_keys": [
808
+ "psm_retraction_pose",
809
+ "psm_retraction_gripper",
810
+ "psm_cutter_pose",
811
+ "psm_cutter_gripper"
812
+ ],
813
+ "sin_cos_embedding_keys": null,
814
+ "mean_std_embedding_keys": null,
815
+ "min_max_embedding_keys": null,
816
+ "pass_through_keys": null,
817
+ "action_configs": [
818
+ {
819
+ "rep": "REL_XYZ_ROT6D",
820
+ "type": "EEF",
821
+ "format": "XYZ_ROT6D",
822
+ "state_key": "psm_retraction_pose",
823
+ "input_rotation_format": "quat",
824
+ "input_quat_order": "wxyz",
825
+ "reference_rotation_format": "quat",
826
+ "reference_quat_order": "wxyz",
827
+ "translation_scaling_key": null,
828
+ "rotation_scaling_key": null,
829
+ "hold_through_clutch": false,
830
+ "normalization_type": "percentile"
831
+ },
832
+ {
833
+ "rep": "ABSOLUTE",
834
+ "type": "NON_EEF",
835
+ "format": "DEFAULT",
836
+ "state_key": null,
837
+ "input_rotation_format": "quat",
838
+ "input_quat_order": "xyzw",
839
+ "reference_rotation_format": "rot6d",
840
+ "reference_quat_order": "xyzw",
841
+ "translation_scaling_key": null,
842
+ "rotation_scaling_key": null,
843
+ "hold_through_clutch": false,
844
+ "normalization_type": "percentile"
845
+ },
846
+ {
847
+ "rep": "REL_XYZ_ROT6D",
848
+ "type": "EEF",
849
+ "format": "XYZ_ROT6D",
850
+ "state_key": "psm_cutter_pose",
851
+ "input_rotation_format": "quat",
852
+ "input_quat_order": "wxyz",
853
+ "reference_rotation_format": "quat",
854
+ "reference_quat_order": "wxyz",
855
+ "translation_scaling_key": null,
856
+ "rotation_scaling_key": null,
857
+ "hold_through_clutch": false,
858
+ "normalization_type": "percentile"
859
+ },
860
+ {
861
+ "rep": "ABSOLUTE",
862
+ "type": "NON_EEF",
863
+ "format": "DEFAULT",
864
+ "state_key": null,
865
+ "input_rotation_format": "quat",
866
+ "input_quat_order": "xyzw",
867
+ "reference_rotation_format": "rot6d",
868
+ "reference_quat_order": "xyzw",
869
+ "translation_scaling_key": null,
870
+ "rotation_scaling_key": null,
871
+ "hold_through_clutch": false,
872
+ "normalization_type": "percentile"
873
+ }
874
+ ]
875
+ },
876
+ "language": {
877
+ "delta_indices": [
878
+ 0
879
+ ],
880
+ "modality_keys": [
881
+ "task"
882
+ ],
883
+ "sin_cos_embedding_keys": null,
884
+ "mean_std_embedding_keys": null,
885
+ "min_max_embedding_keys": null,
886
+ "pass_through_keys": null,
887
+ "action_configs": null
888
+ }
889
+ },
890
+ "jhu_imerse_dvrk": {
891
+ "video": {
892
+ "delta_indices": [
893
+ 0
894
+ ],
895
+ "modality_keys": [
896
+ "endoscope_left",
897
+ "wrist_left",
898
+ "wrist_right"
899
+ ],
900
+ "sin_cos_embedding_keys": null,
901
+ "mean_std_embedding_keys": null,
902
+ "min_max_embedding_keys": null,
903
+ "pass_through_keys": null,
904
+ "action_configs": null
905
+ },
906
+ "state": {
907
+ "delta_indices": [
908
+ 0
909
+ ],
910
+ "modality_keys": [
911
+ "psm1_pose",
912
+ "psm1_gripper",
913
+ "psm2_pose",
914
+ "psm2_gripper"
915
+ ],
916
+ "sin_cos_embedding_keys": null,
917
+ "mean_std_embedding_keys": [
918
+ "psm1_pose",
919
+ "psm1_gripper",
920
+ "psm2_pose",
921
+ "psm2_gripper"
922
+ ],
923
+ "min_max_embedding_keys": null,
924
+ "pass_through_keys": null,
925
+ "action_configs": null
926
+ },
927
+ "action": {
928
+ "delta_indices": [
929
+ 1,
930
+ 2,
931
+ 3,
932
+ 4,
933
+ 5,
934
+ 6,
935
+ 7,
936
+ 8,
937
+ 9,
938
+ 10,
939
+ 11,
940
+ 12,
941
+ 13,
942
+ 14,
943
+ 15,
944
+ 16,
945
+ 17,
946
+ 18,
947
+ 19,
948
+ 20,
949
+ 21,
950
+ 22,
951
+ 23,
952
+ 24,
953
+ 25,
954
+ 26,
955
+ 27,
956
+ 28,
957
+ 29,
958
+ 30,
959
+ 31,
960
+ 32,
961
+ 33,
962
+ 34,
963
+ 35,
964
+ 36,
965
+ 37,
966
+ 38,
967
+ 39,
968
+ 40,
969
+ 41,
970
+ 42,
971
+ 43,
972
+ 44,
973
+ 45,
974
+ 46,
975
+ 47,
976
+ 48,
977
+ 49,
978
+ 50
979
+ ],
980
+ "modality_keys": [
981
+ "psm1_pose",
982
+ "psm1_gripper",
983
+ "psm2_pose",
984
+ "psm2_gripper"
985
+ ],
986
+ "sin_cos_embedding_keys": null,
987
+ "mean_std_embedding_keys": null,
988
+ "min_max_embedding_keys": null,
989
+ "pass_through_keys": null,
990
+ "action_configs": [
991
+ {
992
+ "rep": "REL_XYZ_ROT6D",
993
+ "type": "EEF",
994
+ "format": "XYZ_ROT6D",
995
+ "state_key": "psm1_pose",
996
+ "input_rotation_format": "quat",
997
+ "input_quat_order": "xyzw",
998
+ "reference_rotation_format": "quat",
999
+ "reference_quat_order": "xyzw",
1000
+ "translation_scaling_key": null,
1001
+ "rotation_scaling_key": null,
1002
+ "hold_through_clutch": false,
1003
+ "normalization_type": "percentile"
1004
+ },
1005
+ {
1006
+ "rep": "ABSOLUTE",
1007
+ "type": "NON_EEF",
1008
+ "format": "DEFAULT",
1009
+ "state_key": null,
1010
+ "input_rotation_format": "quat",
1011
+ "input_quat_order": "xyzw",
1012
+ "reference_rotation_format": "rot6d",
1013
+ "reference_quat_order": "xyzw",
1014
+ "translation_scaling_key": null,
1015
+ "rotation_scaling_key": null,
1016
+ "hold_through_clutch": false,
1017
+ "normalization_type": "percentile"
1018
+ },
1019
+ {
1020
+ "rep": "REL_XYZ_ROT6D",
1021
+ "type": "EEF",
1022
+ "format": "XYZ_ROT6D",
1023
+ "state_key": "psm2_pose",
1024
+ "input_rotation_format": "quat",
1025
+ "input_quat_order": "xyzw",
1026
+ "reference_rotation_format": "quat",
1027
+ "reference_quat_order": "xyzw",
1028
+ "translation_scaling_key": null,
1029
+ "rotation_scaling_key": null,
1030
+ "hold_through_clutch": false,
1031
+ "normalization_type": "percentile"
1032
+ },
1033
+ {
1034
+ "rep": "ABSOLUTE",
1035
+ "type": "NON_EEF",
1036
+ "format": "DEFAULT",
1037
+ "state_key": null,
1038
+ "input_rotation_format": "quat",
1039
+ "input_quat_order": "xyzw",
1040
+ "reference_rotation_format": "rot6d",
1041
+ "reference_quat_order": "xyzw",
1042
+ "translation_scaling_key": null,
1043
+ "rotation_scaling_key": null,
1044
+ "hold_through_clutch": false,
1045
+ "normalization_type": "percentile"
1046
+ }
1047
+ ]
1048
+ },
1049
+ "language": {
1050
+ "delta_indices": [
1051
+ 0
1052
+ ],
1053
+ "modality_keys": [
1054
+ "annotation.human.task_description"
1055
+ ],
1056
+ "sin_cos_embedding_keys": null,
1057
+ "mean_std_embedding_keys": null,
1058
+ "min_max_embedding_keys": null,
1059
+ "pass_through_keys": null,
1060
+ "action_configs": null
1061
+ }
1062
+ },
1063
+ "obuda_dvrk": {
1064
+ "video": {
1065
+ "delta_indices": [
1066
+ 0
1067
+ ],
1068
+ "modality_keys": [
1069
+ "endoscope_left",
1070
+ "wrist_left",
1071
+ "wrist_right"
1072
+ ],
1073
+ "sin_cos_embedding_keys": null,
1074
+ "mean_std_embedding_keys": null,
1075
+ "min_max_embedding_keys": null,
1076
+ "pass_through_keys": null,
1077
+ "action_configs": null
1078
+ },
1079
+ "state": {
1080
+ "delta_indices": [
1081
+ 0
1082
+ ],
1083
+ "modality_keys": [
1084
+ "psm1_pose",
1085
+ "psm1_gripper",
1086
+ "psm2_pose",
1087
+ "psm2_gripper"
1088
+ ],
1089
+ "sin_cos_embedding_keys": null,
1090
+ "mean_std_embedding_keys": [
1091
+ "psm1_pose",
1092
+ "psm1_gripper",
1093
+ "psm2_pose",
1094
+ "psm2_gripper"
1095
+ ],
1096
+ "min_max_embedding_keys": null,
1097
+ "pass_through_keys": null,
1098
+ "action_configs": null
1099
+ },
1100
+ "action": {
1101
+ "delta_indices": [
1102
+ 0,
1103
+ 1,
1104
+ 2,
1105
+ 3,
1106
+ 4,
1107
+ 5,
1108
+ 6,
1109
+ 7,
1110
+ 8,
1111
+ 9,
1112
+ 10,
1113
+ 11,
1114
+ 12,
1115
+ 13,
1116
+ 14,
1117
+ 15,
1118
+ 16,
1119
+ 17,
1120
+ 18,
1121
+ 19,
1122
+ 20,
1123
+ 21,
1124
+ 22,
1125
+ 23,
1126
+ 24,
1127
+ 25,
1128
+ 26,
1129
+ 27,
1130
+ 28,
1131
+ 29,
1132
+ 30,
1133
+ 31,
1134
+ 32,
1135
+ 33,
1136
+ 34,
1137
+ 35,
1138
+ 36,
1139
+ 37,
1140
+ 38,
1141
+ 39,
1142
+ 40,
1143
+ 41,
1144
+ 42,
1145
+ 43,
1146
+ 44,
1147
+ 45,
1148
+ 46,
1149
+ 47,
1150
+ 48,
1151
+ 49
1152
+ ],
1153
+ "modality_keys": [
1154
+ "psm1_pose",
1155
+ "psm1_gripper",
1156
+ "psm2_pose",
1157
+ "psm2_gripper"
1158
+ ],
1159
+ "sin_cos_embedding_keys": null,
1160
+ "mean_std_embedding_keys": null,
1161
+ "min_max_embedding_keys": null,
1162
+ "pass_through_keys": null,
1163
+ "action_configs": [
1164
+ {
1165
+ "rep": "REL_XYZ_ROT6D",
1166
+ "type": "EEF",
1167
+ "format": "XYZ_ROT6D",
1168
+ "state_key": "psm1_pose",
1169
+ "input_rotation_format": "quat",
1170
+ "input_quat_order": "xyzw",
1171
+ "reference_rotation_format": "quat",
1172
+ "reference_quat_order": "xyzw",
1173
+ "translation_scaling_key": null,
1174
+ "rotation_scaling_key": null,
1175
+ "hold_through_clutch": false,
1176
+ "normalization_type": "percentile"
1177
+ },
1178
+ {
1179
+ "rep": "ABSOLUTE",
1180
+ "type": "NON_EEF",
1181
+ "format": "DEFAULT",
1182
+ "state_key": null,
1183
+ "input_rotation_format": "quat",
1184
+ "input_quat_order": "xyzw",
1185
+ "reference_rotation_format": "rot6d",
1186
+ "reference_quat_order": "xyzw",
1187
+ "translation_scaling_key": null,
1188
+ "rotation_scaling_key": null,
1189
+ "hold_through_clutch": false,
1190
+ "normalization_type": "percentile"
1191
+ },
1192
+ {
1193
+ "rep": "REL_XYZ_ROT6D",
1194
+ "type": "EEF",
1195
+ "format": "XYZ_ROT6D",
1196
+ "state_key": "psm2_pose",
1197
+ "input_rotation_format": "quat",
1198
+ "input_quat_order": "xyzw",
1199
+ "reference_rotation_format": "quat",
1200
+ "reference_quat_order": "xyzw",
1201
+ "translation_scaling_key": null,
1202
+ "rotation_scaling_key": null,
1203
+ "hold_through_clutch": false,
1204
+ "normalization_type": "percentile"
1205
+ },
1206
+ {
1207
+ "rep": "ABSOLUTE",
1208
+ "type": "NON_EEF",
1209
+ "format": "DEFAULT",
1210
+ "state_key": null,
1211
+ "input_rotation_format": "quat",
1212
+ "input_quat_order": "xyzw",
1213
+ "reference_rotation_format": "rot6d",
1214
+ "reference_quat_order": "xyzw",
1215
+ "translation_scaling_key": null,
1216
+ "rotation_scaling_key": null,
1217
+ "hold_through_clutch": false,
1218
+ "normalization_type": "percentile"
1219
+ }
1220
+ ]
1221
+ },
1222
+ "language": {
1223
+ "delta_indices": [
1224
+ 0
1225
+ ],
1226
+ "modality_keys": [
1227
+ "task"
1228
+ ],
1229
+ "sin_cos_embedding_keys": null,
1230
+ "mean_std_embedding_keys": null,
1231
+ "min_max_embedding_keys": null,
1232
+ "pass_through_keys": null,
1233
+ "action_configs": null
1234
+ }
1235
+ },
1236
+ "stanford_dvrk_real": {
1237
+ "video": {
1238
+ "delta_indices": [
1239
+ 0
1240
+ ],
1241
+ "modality_keys": [
1242
+ "endoscope_left"
1243
+ ],
1244
+ "sin_cos_embedding_keys": null,
1245
+ "mean_std_embedding_keys": null,
1246
+ "min_max_embedding_keys": null,
1247
+ "pass_through_keys": null,
1248
+ "action_configs": null
1249
+ },
1250
+ "state": {
1251
+ "delta_indices": [
1252
+ 0
1253
+ ],
1254
+ "modality_keys": [
1255
+ "psm1_pose",
1256
+ "psm1_gripper",
1257
+ "psm2_pose",
1258
+ "psm2_gripper"
1259
+ ],
1260
+ "sin_cos_embedding_keys": null,
1261
+ "mean_std_embedding_keys": [
1262
+ "psm1_pose",
1263
+ "psm1_gripper",
1264
+ "psm2_pose",
1265
+ "psm2_gripper"
1266
+ ],
1267
+ "min_max_embedding_keys": null,
1268
+ "pass_through_keys": null,
1269
+ "action_configs": null
1270
+ },
1271
+ "action": {
1272
+ "delta_indices": [
1273
+ 0,
1274
+ 1,
1275
+ 2,
1276
+ 3,
1277
+ 4,
1278
+ 5,
1279
+ 6,
1280
+ 7,
1281
+ 8,
1282
+ 9,
1283
+ 10,
1284
+ 11,
1285
+ 12,
1286
+ 13,
1287
+ 14,
1288
+ 15,
1289
+ 16,
1290
+ 17,
1291
+ 18,
1292
+ 19,
1293
+ 20,
1294
+ 21,
1295
+ 22,
1296
+ 23,
1297
+ 24,
1298
+ 25,
1299
+ 26,
1300
+ 27,
1301
+ 28,
1302
+ 29,
1303
+ 30,
1304
+ 31,
1305
+ 32,
1306
+ 33,
1307
+ 34,
1308
+ 35,
1309
+ 36,
1310
+ 37,
1311
+ 38,
1312
+ 39,
1313
+ 40,
1314
+ 41,
1315
+ 42,
1316
+ 43,
1317
+ 44,
1318
+ 45,
1319
+ 46,
1320
+ 47,
1321
+ 48,
1322
+ 49
1323
+ ],
1324
+ "modality_keys": [
1325
+ "psm1_pose",
1326
+ "psm1_gripper",
1327
+ "psm2_pose",
1328
+ "psm2_gripper"
1329
+ ],
1330
+ "sin_cos_embedding_keys": null,
1331
+ "mean_std_embedding_keys": null,
1332
+ "min_max_embedding_keys": null,
1333
+ "pass_through_keys": null,
1334
+ "action_configs": [
1335
+ {
1336
+ "rep": "REL_XYZ_ROT6D",
1337
+ "type": "EEF",
1338
+ "format": "XYZ_ROT6D",
1339
+ "state_key": "psm1_pose",
1340
+ "input_rotation_format": "euler",
1341
+ "input_quat_order": "xyzw",
1342
+ "reference_rotation_format": "euler",
1343
+ "reference_quat_order": "xyzw",
1344
+ "translation_scaling_key": null,
1345
+ "rotation_scaling_key": null,
1346
+ "hold_through_clutch": false,
1347
+ "normalization_type": "percentile"
1348
+ },
1349
+ {
1350
+ "rep": "ABSOLUTE",
1351
+ "type": "NON_EEF",
1352
+ "format": "DEFAULT",
1353
+ "state_key": null,
1354
+ "input_rotation_format": "quat",
1355
+ "input_quat_order": "xyzw",
1356
+ "reference_rotation_format": "rot6d",
1357
+ "reference_quat_order": "xyzw",
1358
+ "translation_scaling_key": null,
1359
+ "rotation_scaling_key": null,
1360
+ "hold_through_clutch": false,
1361
+ "normalization_type": "percentile"
1362
+ },
1363
+ {
1364
+ "rep": "REL_XYZ_ROT6D",
1365
+ "type": "EEF",
1366
+ "format": "XYZ_ROT6D",
1367
+ "state_key": "psm2_pose",
1368
+ "input_rotation_format": "euler",
1369
+ "input_quat_order": "xyzw",
1370
+ "reference_rotation_format": "euler",
1371
+ "reference_quat_order": "xyzw",
1372
+ "translation_scaling_key": null,
1373
+ "rotation_scaling_key": null,
1374
+ "hold_through_clutch": false,
1375
+ "normalization_type": "percentile"
1376
+ },
1377
+ {
1378
+ "rep": "ABSOLUTE",
1379
+ "type": "NON_EEF",
1380
+ "format": "DEFAULT",
1381
+ "state_key": null,
1382
+ "input_rotation_format": "quat",
1383
+ "input_quat_order": "xyzw",
1384
+ "reference_rotation_format": "rot6d",
1385
+ "reference_quat_order": "xyzw",
1386
+ "translation_scaling_key": null,
1387
+ "rotation_scaling_key": null,
1388
+ "hold_through_clutch": false,
1389
+ "normalization_type": "percentile"
1390
+ }
1391
+ ]
1392
+ },
1393
+ "language": {
1394
+ "delta_indices": [
1395
+ 0
1396
+ ],
1397
+ "modality_keys": [
1398
+ "task"
1399
+ ],
1400
+ "sin_cos_embedding_keys": null,
1401
+ "mean_std_embedding_keys": null,
1402
+ "min_max_embedding_keys": null,
1403
+ "pass_through_keys": null,
1404
+ "action_configs": null
1405
+ }
1406
+ },
1407
+ "tud_tundra_ur5e": {
1408
+ "video": {
1409
+ "delta_indices": [
1410
+ 0
1411
+ ],
1412
+ "modality_keys": [
1413
+ "laparoscope_left"
1414
+ ],
1415
+ "sin_cos_embedding_keys": null,
1416
+ "mean_std_embedding_keys": null,
1417
+ "min_max_embedding_keys": null,
1418
+ "pass_through_keys": null,
1419
+ "action_configs": null
1420
+ },
1421
+ "state": {
1422
+ "delta_indices": [
1423
+ 0
1424
+ ],
1425
+ "modality_keys": [
1426
+ "joint_position",
1427
+ "eef_pose"
1428
+ ],
1429
+ "sin_cos_embedding_keys": null,
1430
+ "mean_std_embedding_keys": [
1431
+ "joint_position"
1432
+ ],
1433
+ "min_max_embedding_keys": null,
1434
+ "pass_through_keys": [
1435
+ "eef_pose"
1436
+ ],
1437
+ "action_configs": null
1438
+ },
1439
+ "action": {
1440
+ "delta_indices": [
1441
+ 1,
1442
+ 2,
1443
+ 3,
1444
+ 4,
1445
+ 5,
1446
+ 6,
1447
+ 7,
1448
+ 8,
1449
+ 9,
1450
+ 10,
1451
+ 11,
1452
+ 12,
1453
+ 13,
1454
+ 14,
1455
+ 15,
1456
+ 16,
1457
+ 17,
1458
+ 18,
1459
+ 19,
1460
+ 20,
1461
+ 21,
1462
+ 22,
1463
+ 23,
1464
+ 24,
1465
+ 25,
1466
+ 26,
1467
+ 27,
1468
+ 28,
1469
+ 29,
1470
+ 30,
1471
+ 31,
1472
+ 32,
1473
+ 33,
1474
+ 34,
1475
+ 35,
1476
+ 36,
1477
+ 37,
1478
+ 38,
1479
+ 39,
1480
+ 40,
1481
+ 41,
1482
+ 42,
1483
+ 43,
1484
+ 44,
1485
+ 45,
1486
+ 46,
1487
+ 47,
1488
+ 48,
1489
+ 49,
1490
+ 50
1491
+ ],
1492
+ "modality_keys": [
1493
+ "eef_pose",
1494
+ "gripper"
1495
+ ],
1496
+ "sin_cos_embedding_keys": null,
1497
+ "mean_std_embedding_keys": null,
1498
+ "min_max_embedding_keys": null,
1499
+ "pass_through_keys": null,
1500
+ "action_configs": [
1501
+ {
1502
+ "rep": "REL_XYZ_ROT6D",
1503
+ "type": "EEF",
1504
+ "format": "XYZ_ROT6D",
1505
+ "state_key": "eef_pose",
1506
+ "input_rotation_format": "quat",
1507
+ "input_quat_order": "xyzw",
1508
+ "reference_rotation_format": "quat",
1509
+ "reference_quat_order": "xyzw",
1510
+ "translation_scaling_key": null,
1511
+ "rotation_scaling_key": null,
1512
+ "hold_through_clutch": false,
1513
+ "normalization_type": "percentile"
1514
+ },
1515
+ {
1516
+ "rep": "ABSOLUTE",
1517
+ "type": "NON_EEF",
1518
+ "format": "DEFAULT",
1519
+ "state_key": null,
1520
+ "input_rotation_format": "quat",
1521
+ "input_quat_order": "xyzw",
1522
+ "reference_rotation_format": "rot6d",
1523
+ "reference_quat_order": "xyzw",
1524
+ "translation_scaling_key": null,
1525
+ "rotation_scaling_key": null,
1526
+ "hold_through_clutch": false,
1527
+ "normalization_type": "percentile"
1528
+ }
1529
+ ]
1530
+ },
1531
+ "language": {
1532
+ "delta_indices": [
1533
+ 0
1534
+ ],
1535
+ "modality_keys": [
1536
+ "task"
1537
+ ],
1538
+ "sin_cos_embedding_keys": null,
1539
+ "mean_std_embedding_keys": null,
1540
+ "min_max_embedding_keys": null,
1541
+ "pass_through_keys": null,
1542
+ "action_configs": null
1543
+ }
1544
+ },
1545
+ "jhu_lscr_dvrk_smarts": {
1546
+ "video": {
1547
+ "delta_indices": [
1548
+ 0
1549
+ ],
1550
+ "modality_keys": [
1551
+ "endoscope_left",
1552
+ "camera_side_view"
1553
+ ],
1554
+ "sin_cos_embedding_keys": null,
1555
+ "mean_std_embedding_keys": null,
1556
+ "min_max_embedding_keys": null,
1557
+ "pass_through_keys": null,
1558
+ "action_configs": null
1559
+ },
1560
+ "state": {
1561
+ "delta_indices": [
1562
+ 0
1563
+ ],
1564
+ "modality_keys": [
1565
+ "psm1_pose",
1566
+ "psm1_gripper",
1567
+ "psm2_pose",
1568
+ "psm2_gripper"
1569
+ ],
1570
+ "sin_cos_embedding_keys": null,
1571
+ "mean_std_embedding_keys": [
1572
+ "psm1_pose",
1573
+ "psm1_gripper",
1574
+ "psm2_pose",
1575
+ "psm2_gripper"
1576
+ ],
1577
+ "min_max_embedding_keys": null,
1578
+ "pass_through_keys": null,
1579
+ "action_configs": null
1580
+ },
1581
+ "action": {
1582
+ "delta_indices": [
1583
+ 1,
1584
+ 2,
1585
+ 3,
1586
+ 4,
1587
+ 5,
1588
+ 6,
1589
+ 7,
1590
+ 8,
1591
+ 9,
1592
+ 10,
1593
+ 11,
1594
+ 12,
1595
+ 13,
1596
+ 14,
1597
+ 15,
1598
+ 16
1599
+ ],
1600
+ "modality_keys": [
1601
+ "psm1_pose",
1602
+ "psm1_gripper",
1603
+ "psm2_pose",
1604
+ "psm2_gripper"
1605
+ ],
1606
+ "sin_cos_embedding_keys": null,
1607
+ "mean_std_embedding_keys": null,
1608
+ "min_max_embedding_keys": null,
1609
+ "pass_through_keys": null,
1610
+ "action_configs": [
1611
+ {
1612
+ "rep": "REL_XYZ_ROT6D",
1613
+ "type": "EEF",
1614
+ "format": "XYZ_ROT6D",
1615
+ "state_key": "psm1_pose",
1616
+ "input_rotation_format": "quat",
1617
+ "input_quat_order": "xyzw",
1618
+ "reference_rotation_format": "quat",
1619
+ "reference_quat_order": "xyzw",
1620
+ "translation_scaling_key": null,
1621
+ "rotation_scaling_key": null,
1622
+ "hold_through_clutch": false,
1623
+ "normalization_type": "percentile"
1624
+ },
1625
+ {
1626
+ "rep": "ABSOLUTE",
1627
+ "type": "NON_EEF",
1628
+ "format": "DEFAULT",
1629
+ "state_key": null,
1630
+ "input_rotation_format": "quat",
1631
+ "input_quat_order": "xyzw",
1632
+ "reference_rotation_format": "rot6d",
1633
+ "reference_quat_order": "xyzw",
1634
+ "translation_scaling_key": null,
1635
+ "rotation_scaling_key": null,
1636
+ "hold_through_clutch": false,
1637
+ "normalization_type": "percentile"
1638
+ },
1639
+ {
1640
+ "rep": "REL_XYZ_ROT6D",
1641
+ "type": "EEF",
1642
+ "format": "XYZ_ROT6D",
1643
+ "state_key": "psm2_pose",
1644
+ "input_rotation_format": "quat",
1645
+ "input_quat_order": "xyzw",
1646
+ "reference_rotation_format": "quat",
1647
+ "reference_quat_order": "xyzw",
1648
+ "translation_scaling_key": null,
1649
+ "rotation_scaling_key": null,
1650
+ "hold_through_clutch": false,
1651
+ "normalization_type": "percentile"
1652
+ },
1653
+ {
1654
+ "rep": "ABSOLUTE",
1655
+ "type": "NON_EEF",
1656
+ "format": "DEFAULT",
1657
+ "state_key": null,
1658
+ "input_rotation_format": "quat",
1659
+ "input_quat_order": "xyzw",
1660
+ "reference_rotation_format": "rot6d",
1661
+ "reference_quat_order": "xyzw",
1662
+ "translation_scaling_key": null,
1663
+ "rotation_scaling_key": null,
1664
+ "hold_through_clutch": false,
1665
+ "normalization_type": "percentile"
1666
+ }
1667
+ ]
1668
+ },
1669
+ "language": {
1670
+ "delta_indices": [
1671
+ 0
1672
+ ],
1673
+ "modality_keys": [
1674
+ "annotation.task"
1675
+ ],
1676
+ "sin_cos_embedding_keys": null,
1677
+ "mean_std_embedding_keys": null,
1678
+ "min_max_embedding_keys": null,
1679
+ "pass_through_keys": null,
1680
+ "action_configs": null
1681
+ }
1682
+ },
1683
+ "jhu_imerse_dvrk_mono": {
1684
+ "video": {
1685
+ "delta_indices": [
1686
+ 0
1687
+ ],
1688
+ "modality_keys": [
1689
+ "endoscope_left"
1690
+ ],
1691
+ "sin_cos_embedding_keys": null,
1692
+ "mean_std_embedding_keys": null,
1693
+ "min_max_embedding_keys": null,
1694
+ "pass_through_keys": null,
1695
+ "action_configs": null
1696
+ },
1697
+ "state": {
1698
+ "delta_indices": [
1699
+ 0
1700
+ ],
1701
+ "modality_keys": [
1702
+ "psm1_pose",
1703
+ "psm1_gripper",
1704
+ "psm2_pose",
1705
+ "psm2_gripper"
1706
+ ],
1707
+ "sin_cos_embedding_keys": null,
1708
+ "mean_std_embedding_keys": [
1709
+ "psm1_pose",
1710
+ "psm1_gripper",
1711
+ "psm2_pose",
1712
+ "psm2_gripper"
1713
+ ],
1714
+ "min_max_embedding_keys": null,
1715
+ "pass_through_keys": null,
1716
+ "action_configs": null
1717
+ },
1718
+ "action": {
1719
+ "delta_indices": [
1720
+ 0,
1721
+ 1,
1722
+ 2,
1723
+ 3,
1724
+ 4,
1725
+ 5,
1726
+ 6,
1727
+ 7,
1728
+ 8,
1729
+ 9,
1730
+ 10,
1731
+ 11,
1732
+ 12,
1733
+ 13,
1734
+ 14,
1735
+ 15,
1736
+ 16,
1737
+ 17,
1738
+ 18,
1739
+ 19,
1740
+ 20,
1741
+ 21,
1742
+ 22,
1743
+ 23,
1744
+ 24,
1745
+ 25,
1746
+ 26,
1747
+ 27,
1748
+ 28,
1749
+ 29,
1750
+ 30,
1751
+ 31,
1752
+ 32,
1753
+ 33,
1754
+ 34,
1755
+ 35,
1756
+ 36,
1757
+ 37,
1758
+ 38,
1759
+ 39,
1760
+ 40,
1761
+ 41,
1762
+ 42,
1763
+ 43,
1764
+ 44,
1765
+ 45,
1766
+ 46,
1767
+ 47,
1768
+ 48,
1769
+ 49
1770
+ ],
1771
+ "modality_keys": [
1772
+ "psm1_pose",
1773
+ "psm1_gripper",
1774
+ "psm2_pose",
1775
+ "psm2_gripper"
1776
+ ],
1777
+ "sin_cos_embedding_keys": null,
1778
+ "mean_std_embedding_keys": null,
1779
+ "min_max_embedding_keys": null,
1780
+ "pass_through_keys": null,
1781
+ "action_configs": [
1782
+ {
1783
+ "rep": "REL_XYZ_ROT6D",
1784
+ "type": "EEF",
1785
+ "format": "XYZ_ROT6D",
1786
+ "state_key": "psm1_pose",
1787
+ "input_rotation_format": "quat",
1788
+ "input_quat_order": "xyzw",
1789
+ "reference_rotation_format": "quat",
1790
+ "reference_quat_order": "xyzw",
1791
+ "translation_scaling_key": null,
1792
+ "rotation_scaling_key": null,
1793
+ "hold_through_clutch": false,
1794
+ "normalization_type": "temporal_meanstd"
1795
+ },
1796
+ {
1797
+ "rep": "ABSOLUTE",
1798
+ "type": "NON_EEF",
1799
+ "format": "DEFAULT",
1800
+ "state_key": null,
1801
+ "input_rotation_format": "quat",
1802
+ "input_quat_order": "xyzw",
1803
+ "reference_rotation_format": "rot6d",
1804
+ "reference_quat_order": "xyzw",
1805
+ "translation_scaling_key": null,
1806
+ "rotation_scaling_key": null,
1807
+ "hold_through_clutch": false,
1808
+ "normalization_type": "temporal_meanstd"
1809
+ },
1810
+ {
1811
+ "rep": "REL_XYZ_ROT6D",
1812
+ "type": "EEF",
1813
+ "format": "XYZ_ROT6D",
1814
+ "state_key": "psm2_pose",
1815
+ "input_rotation_format": "quat",
1816
+ "input_quat_order": "xyzw",
1817
+ "reference_rotation_format": "quat",
1818
+ "reference_quat_order": "xyzw",
1819
+ "translation_scaling_key": null,
1820
+ "rotation_scaling_key": null,
1821
+ "hold_through_clutch": false,
1822
+ "normalization_type": "temporal_meanstd"
1823
+ },
1824
+ {
1825
+ "rep": "ABSOLUTE",
1826
+ "type": "NON_EEF",
1827
+ "format": "DEFAULT",
1828
+ "state_key": null,
1829
+ "input_rotation_format": "quat",
1830
+ "input_quat_order": "xyzw",
1831
+ "reference_rotation_format": "rot6d",
1832
+ "reference_quat_order": "xyzw",
1833
+ "translation_scaling_key": null,
1834
+ "rotation_scaling_key": null,
1835
+ "hold_through_clutch": false,
1836
+ "normalization_type": "temporal_meanstd"
1837
+ }
1838
+ ]
1839
+ },
1840
+ "language": {
1841
+ "delta_indices": [
1842
+ 0
1843
+ ],
1844
+ "modality_keys": [
1845
+ "annotation.human.task_description"
1846
+ ],
1847
+ "sin_cos_embedding_keys": null,
1848
+ "mean_std_embedding_keys": null,
1849
+ "min_max_embedding_keys": null,
1850
+ "pass_through_keys": null,
1851
+ "action_configs": null
1852
+ }
1853
+ },
1854
+ "rob_surgical_bitrack": {
1855
+ "video": {
1856
+ "delta_indices": [
1857
+ 0
1858
+ ],
1859
+ "modality_keys": [
1860
+ "endoscope"
1861
+ ],
1862
+ "sin_cos_embedding_keys": null,
1863
+ "mean_std_embedding_keys": null,
1864
+ "min_max_embedding_keys": null,
1865
+ "pass_through_keys": null,
1866
+ "action_configs": null
1867
+ },
1868
+ "state": {
1869
+ "delta_indices": [
1870
+ 0
1871
+ ],
1872
+ "modality_keys": [
1873
+ "left_pose",
1874
+ "right_pose",
1875
+ "aux_pose"
1876
+ ],
1877
+ "sin_cos_embedding_keys": null,
1878
+ "mean_std_embedding_keys": [
1879
+ "left_pose",
1880
+ "right_pose",
1881
+ "aux_pose"
1882
+ ],
1883
+ "min_max_embedding_keys": null,
1884
+ "pass_through_keys": null,
1885
+ "action_configs": null
1886
+ },
1887
+ "action": {
1888
+ "delta_indices": [
1889
+ 0,
1890
+ 1,
1891
+ 2,
1892
+ 3,
1893
+ 4,
1894
+ 5,
1895
+ 6,
1896
+ 7,
1897
+ 8,
1898
+ 9,
1899
+ 10,
1900
+ 11,
1901
+ 12,
1902
+ 13,
1903
+ 14,
1904
+ 15,
1905
+ 16,
1906
+ 17,
1907
+ 18,
1908
+ 19,
1909
+ 20,
1910
+ 21,
1911
+ 22,
1912
+ 23,
1913
+ 24,
1914
+ 25,
1915
+ 26,
1916
+ 27,
1917
+ 28,
1918
+ 29,
1919
+ 30,
1920
+ 31,
1921
+ 32,
1922
+ 33,
1923
+ 34,
1924
+ 35,
1925
+ 36,
1926
+ 37,
1927
+ 38,
1928
+ 39,
1929
+ 40,
1930
+ 41,
1931
+ 42,
1932
+ 43,
1933
+ 44,
1934
+ 45,
1935
+ 46,
1936
+ 47,
1937
+ 48,
1938
+ 49
1939
+ ],
1940
+ "modality_keys": [
1941
+ "left_pose",
1942
+ "right_pose",
1943
+ "aux_pose"
1944
+ ],
1945
+ "sin_cos_embedding_keys": null,
1946
+ "mean_std_embedding_keys": null,
1947
+ "min_max_embedding_keys": null,
1948
+ "pass_through_keys": null,
1949
+ "action_configs": [
1950
+ {
1951
+ "rep": "REL_XYZ_ROT6D",
1952
+ "type": "EEF",
1953
+ "format": "XYZ_ROT6D",
1954
+ "state_key": "left_pose",
1955
+ "input_rotation_format": "euler",
1956
+ "input_quat_order": "xyzw",
1957
+ "reference_rotation_format": "euler",
1958
+ "reference_quat_order": "xyzw",
1959
+ "translation_scaling_key": null,
1960
+ "rotation_scaling_key": null,
1961
+ "hold_through_clutch": false,
1962
+ "normalization_type": "percentile"
1963
+ },
1964
+ {
1965
+ "rep": "REL_XYZ_ROT6D",
1966
+ "type": "EEF",
1967
+ "format": "XYZ_ROT6D",
1968
+ "state_key": "right_pose",
1969
+ "input_rotation_format": "euler",
1970
+ "input_quat_order": "xyzw",
1971
+ "reference_rotation_format": "euler",
1972
+ "reference_quat_order": "xyzw",
1973
+ "translation_scaling_key": null,
1974
+ "rotation_scaling_key": null,
1975
+ "hold_through_clutch": false,
1976
+ "normalization_type": "percentile"
1977
+ },
1978
+ {
1979
+ "rep": "REL_XYZ_ROT6D",
1980
+ "type": "EEF",
1981
+ "format": "XYZ_ROT6D",
1982
+ "state_key": "aux_pose",
1983
+ "input_rotation_format": "euler",
1984
+ "input_quat_order": "xyzw",
1985
+ "reference_rotation_format": "euler",
1986
+ "reference_quat_order": "xyzw",
1987
+ "translation_scaling_key": null,
1988
+ "rotation_scaling_key": null,
1989
+ "hold_through_clutch": false,
1990
+ "normalization_type": "percentile"
1991
+ }
1992
+ ]
1993
+ },
1994
+ "language": {
1995
+ "delta_indices": [
1996
+ 0
1997
+ ],
1998
+ "modality_keys": [
1999
+ "annotation.instruction"
2000
+ ],
2001
+ "sin_cos_embedding_keys": null,
2002
+ "mean_std_embedding_keys": null,
2003
+ "min_max_embedding_keys": null,
2004
+ "pass_through_keys": null,
2005
+ "action_configs": null
2006
+ }
2007
+ },
2008
+ "turin_mitic_ex_vivo": {
2009
+ "video": {
2010
+ "delta_indices": [
2011
+ 0
2012
+ ],
2013
+ "modality_keys": [
2014
+ "endoscope_left"
2015
+ ],
2016
+ "sin_cos_embedding_keys": null,
2017
+ "mean_std_embedding_keys": null,
2018
+ "min_max_embedding_keys": null,
2019
+ "pass_through_keys": null,
2020
+ "action_configs": null
2021
+ },
2022
+ "state": {
2023
+ "delta_indices": [
2024
+ 0
2025
+ ],
2026
+ "modality_keys": [
2027
+ "psm1_joints",
2028
+ "psm2_joints",
2029
+ "psm1_pose",
2030
+ "psm2_pose"
2031
+ ],
2032
+ "sin_cos_embedding_keys": null,
2033
+ "mean_std_embedding_keys": [
2034
+ "psm1_joints",
2035
+ "psm2_joints"
2036
+ ],
2037
+ "min_max_embedding_keys": null,
2038
+ "pass_through_keys": [
2039
+ "psm1_pose",
2040
+ "psm2_pose"
2041
+ ],
2042
+ "action_configs": null
2043
+ },
2044
+ "action": {
2045
+ "delta_indices": [
2046
+ 1,
2047
+ 2,
2048
+ 3,
2049
+ 4,
2050
+ 5,
2051
+ 6,
2052
+ 7,
2053
+ 8,
2054
+ 9,
2055
+ 10,
2056
+ 11,
2057
+ 12,
2058
+ 13,
2059
+ 14,
2060
+ 15,
2061
+ 16,
2062
+ 17,
2063
+ 18,
2064
+ 19,
2065
+ 20,
2066
+ 21,
2067
+ 22,
2068
+ 23,
2069
+ 24,
2070
+ 25,
2071
+ 26,
2072
+ 27,
2073
+ 28,
2074
+ 29,
2075
+ 30,
2076
+ 31,
2077
+ 32,
2078
+ 33,
2079
+ 34,
2080
+ 35,
2081
+ 36,
2082
+ 37,
2083
+ 38,
2084
+ 39,
2085
+ 40,
2086
+ 41,
2087
+ 42,
2088
+ 43,
2089
+ 44,
2090
+ 45,
2091
+ 46,
2092
+ 47,
2093
+ 48,
2094
+ 49,
2095
+ 50
2096
+ ],
2097
+ "modality_keys": [
2098
+ "psm1_pose",
2099
+ "psm2_pose"
2100
+ ],
2101
+ "sin_cos_embedding_keys": null,
2102
+ "mean_std_embedding_keys": null,
2103
+ "min_max_embedding_keys": null,
2104
+ "pass_through_keys": null,
2105
+ "action_configs": [
2106
+ {
2107
+ "rep": "REL_XYZ_ROT6D",
2108
+ "type": "EEF",
2109
+ "format": "XYZ_ROT6D",
2110
+ "state_key": "psm1_pose",
2111
+ "input_rotation_format": "quat",
2112
+ "input_quat_order": "xyzw",
2113
+ "reference_rotation_format": "quat",
2114
+ "reference_quat_order": "xyzw",
2115
+ "translation_scaling_key": null,
2116
+ "rotation_scaling_key": null,
2117
+ "hold_through_clutch": false,
2118
+ "normalization_type": "percentile"
2119
+ },
2120
+ {
2121
+ "rep": "REL_XYZ_ROT6D",
2122
+ "type": "EEF",
2123
+ "format": "XYZ_ROT6D",
2124
+ "state_key": "psm2_pose",
2125
+ "input_rotation_format": "quat",
2126
+ "input_quat_order": "xyzw",
2127
+ "reference_rotation_format": "quat",
2128
+ "reference_quat_order": "xyzw",
2129
+ "translation_scaling_key": null,
2130
+ "rotation_scaling_key": null,
2131
+ "hold_through_clutch": false,
2132
+ "normalization_type": "percentile"
2133
+ }
2134
+ ]
2135
+ },
2136
+ "language": {
2137
+ "delta_indices": [
2138
+ 0
2139
+ ],
2140
+ "modality_keys": [
2141
+ "annotation.instruction"
2142
+ ],
2143
+ "sin_cos_embedding_keys": null,
2144
+ "mean_std_embedding_keys": null,
2145
+ "min_max_embedding_keys": null,
2146
+ "pass_through_keys": null,
2147
+ "action_configs": null
2148
+ }
2149
+ },
2150
+ "ustc_torin_tuodao": {
2151
+ "video": {
2152
+ "delta_indices": [
2153
+ 0
2154
+ ],
2155
+ "modality_keys": [
2156
+ "endoscope_left"
2157
+ ],
2158
+ "sin_cos_embedding_keys": null,
2159
+ "mean_std_embedding_keys": null,
2160
+ "min_max_embedding_keys": null,
2161
+ "pass_through_keys": null,
2162
+ "action_configs": null
2163
+ },
2164
+ "state": {
2165
+ "delta_indices": [
2166
+ 0
2167
+ ],
2168
+ "modality_keys": [
2169
+ "left_joints",
2170
+ "right_joints",
2171
+ "left_pose",
2172
+ "right_pose"
2173
+ ],
2174
+ "sin_cos_embedding_keys": null,
2175
+ "mean_std_embedding_keys": [
2176
+ "left_joints",
2177
+ "right_joints"
2178
+ ],
2179
+ "min_max_embedding_keys": null,
2180
+ "pass_through_keys": [
2181
+ "left_pose",
2182
+ "right_pose"
2183
+ ],
2184
+ "action_configs": null
2185
+ },
2186
+ "action": {
2187
+ "delta_indices": [
2188
+ 0,
2189
+ 1,
2190
+ 2,
2191
+ 3,
2192
+ 4,
2193
+ 5,
2194
+ 6,
2195
+ 7,
2196
+ 8,
2197
+ 9,
2198
+ 10,
2199
+ 11,
2200
+ 12,
2201
+ 13,
2202
+ 14,
2203
+ 15,
2204
+ 16,
2205
+ 17,
2206
+ 18,
2207
+ 19,
2208
+ 20,
2209
+ 21,
2210
+ 22,
2211
+ 23,
2212
+ 24,
2213
+ 25,
2214
+ 26,
2215
+ 27,
2216
+ 28,
2217
+ 29,
2218
+ 30,
2219
+ 31,
2220
+ 32,
2221
+ 33,
2222
+ 34,
2223
+ 35,
2224
+ 36,
2225
+ 37,
2226
+ 38,
2227
+ 39,
2228
+ 40,
2229
+ 41,
2230
+ 42,
2231
+ 43,
2232
+ 44,
2233
+ 45,
2234
+ 46,
2235
+ 47,
2236
+ 48,
2237
+ 49
2238
+ ],
2239
+ "modality_keys": [
2240
+ "left_pose",
2241
+ "left_gripper",
2242
+ "right_pose",
2243
+ "right_gripper"
2244
+ ],
2245
+ "sin_cos_embedding_keys": null,
2246
+ "mean_std_embedding_keys": null,
2247
+ "min_max_embedding_keys": null,
2248
+ "pass_through_keys": null,
2249
+ "action_configs": [
2250
+ {
2251
+ "rep": "REL_XYZ_ROT6D",
2252
+ "type": "EEF",
2253
+ "format": "XYZ_ROT6D",
2254
+ "state_key": "left_pose",
2255
+ "input_rotation_format": "quat",
2256
+ "input_quat_order": "xyzw",
2257
+ "reference_rotation_format": "quat",
2258
+ "reference_quat_order": "xyzw",
2259
+ "translation_scaling_key": null,
2260
+ "rotation_scaling_key": null,
2261
+ "hold_through_clutch": false,
2262
+ "normalization_type": "percentile"
2263
+ },
2264
+ {
2265
+ "rep": "ABSOLUTE",
2266
+ "type": "NON_EEF",
2267
+ "format": "DEFAULT",
2268
+ "state_key": null,
2269
+ "input_rotation_format": "quat",
2270
+ "input_quat_order": "xyzw",
2271
+ "reference_rotation_format": "rot6d",
2272
+ "reference_quat_order": "xyzw",
2273
+ "translation_scaling_key": null,
2274
+ "rotation_scaling_key": null,
2275
+ "hold_through_clutch": false,
2276
+ "normalization_type": "percentile"
2277
+ },
2278
+ {
2279
+ "rep": "REL_XYZ_ROT6D",
2280
+ "type": "EEF",
2281
+ "format": "XYZ_ROT6D",
2282
+ "state_key": "right_pose",
2283
+ "input_rotation_format": "quat",
2284
+ "input_quat_order": "xyzw",
2285
+ "reference_rotation_format": "quat",
2286
+ "reference_quat_order": "xyzw",
2287
+ "translation_scaling_key": null,
2288
+ "rotation_scaling_key": null,
2289
+ "hold_through_clutch": false,
2290
+ "normalization_type": "percentile"
2291
+ },
2292
+ {
2293
+ "rep": "ABSOLUTE",
2294
+ "type": "NON_EEF",
2295
+ "format": "DEFAULT",
2296
+ "state_key": null,
2297
+ "input_rotation_format": "quat",
2298
+ "input_quat_order": "xyzw",
2299
+ "reference_rotation_format": "rot6d",
2300
+ "reference_quat_order": "xyzw",
2301
+ "translation_scaling_key": null,
2302
+ "rotation_scaling_key": null,
2303
+ "hold_through_clutch": false,
2304
+ "normalization_type": "percentile"
2305
+ }
2306
+ ]
2307
+ },
2308
+ "language": {
2309
+ "delta_indices": [
2310
+ 0
2311
+ ],
2312
+ "modality_keys": [
2313
+ "annotation.instruction"
2314
+ ],
2315
+ "sin_cos_embedding_keys": null,
2316
+ "mean_std_embedding_keys": null,
2317
+ "min_max_embedding_keys": null,
2318
+ "pass_through_keys": null,
2319
+ "action_configs": null
2320
+ }
2321
+ },
2322
+ "hamlyn_dvrk_30hz": {
2323
+ "video": {
2324
+ "delta_indices": [
2325
+ 0
2326
+ ],
2327
+ "modality_keys": [
2328
+ "endoscope",
2329
+ "wrist_left",
2330
+ "wrist_right"
2331
+ ],
2332
+ "sin_cos_embedding_keys": null,
2333
+ "mean_std_embedding_keys": null,
2334
+ "min_max_embedding_keys": null,
2335
+ "pass_through_keys": null,
2336
+ "action_configs": null
2337
+ },
2338
+ "state": {
2339
+ "delta_indices": [
2340
+ 0
2341
+ ],
2342
+ "modality_keys": [
2343
+ "left_arm_pose",
2344
+ "left_arm_gripper",
2345
+ "right_arm_pose",
2346
+ "right_arm_gripper"
2347
+ ],
2348
+ "sin_cos_embedding_keys": null,
2349
+ "mean_std_embedding_keys": [
2350
+ "left_arm_pose",
2351
+ "left_arm_gripper",
2352
+ "right_arm_pose",
2353
+ "right_arm_gripper"
2354
+ ],
2355
+ "min_max_embedding_keys": null,
2356
+ "pass_through_keys": null,
2357
+ "action_configs": null
2358
+ },
2359
+ "action": {
2360
+ "delta_indices": [
2361
+ 0,
2362
+ 1,
2363
+ 2,
2364
+ 3,
2365
+ 4,
2366
+ 5,
2367
+ 6,
2368
+ 7,
2369
+ 8,
2370
+ 9,
2371
+ 10,
2372
+ 11,
2373
+ 12,
2374
+ 13,
2375
+ 14,
2376
+ 15,
2377
+ 16,
2378
+ 17,
2379
+ 18,
2380
+ 19,
2381
+ 20,
2382
+ 21,
2383
+ 22,
2384
+ 23,
2385
+ 24,
2386
+ 25,
2387
+ 26,
2388
+ 27,
2389
+ 28,
2390
+ 29,
2391
+ 30,
2392
+ 31,
2393
+ 32,
2394
+ 33,
2395
+ 34,
2396
+ 35,
2397
+ 36,
2398
+ 37,
2399
+ 38,
2400
+ 39,
2401
+ 40,
2402
+ 41,
2403
+ 42,
2404
+ 43,
2405
+ 44,
2406
+ 45,
2407
+ 46,
2408
+ 47,
2409
+ 48,
2410
+ 49
2411
+ ],
2412
+ "modality_keys": [
2413
+ "left_arm_pose",
2414
+ "left_arm_gripper",
2415
+ "right_arm_pose",
2416
+ "right_arm_gripper"
2417
+ ],
2418
+ "sin_cos_embedding_keys": null,
2419
+ "mean_std_embedding_keys": null,
2420
+ "min_max_embedding_keys": null,
2421
+ "pass_through_keys": null,
2422
+ "action_configs": [
2423
+ {
2424
+ "rep": "REL_XYZ_ROT6D",
2425
+ "type": "EEF",
2426
+ "format": "XYZ_ROT6D",
2427
+ "state_key": "left_arm_pose",
2428
+ "input_rotation_format": "quat",
2429
+ "input_quat_order": "wxyz",
2430
+ "reference_rotation_format": "quat",
2431
+ "reference_quat_order": "wxyz",
2432
+ "translation_scaling_key": null,
2433
+ "rotation_scaling_key": null,
2434
+ "hold_through_clutch": false,
2435
+ "normalization_type": "percentile"
2436
+ },
2437
+ {
2438
+ "rep": "ABSOLUTE",
2439
+ "type": "NON_EEF",
2440
+ "format": "DEFAULT",
2441
+ "state_key": null,
2442
+ "input_rotation_format": "quat",
2443
+ "input_quat_order": "xyzw",
2444
+ "reference_rotation_format": "rot6d",
2445
+ "reference_quat_order": "xyzw",
2446
+ "translation_scaling_key": null,
2447
+ "rotation_scaling_key": null,
2448
+ "hold_through_clutch": false,
2449
+ "normalization_type": "percentile"
2450
+ },
2451
+ {
2452
+ "rep": "REL_XYZ_ROT6D",
2453
+ "type": "EEF",
2454
+ "format": "XYZ_ROT6D",
2455
+ "state_key": "right_arm_pose",
2456
+ "input_rotation_format": "quat",
2457
+ "input_quat_order": "wxyz",
2458
+ "reference_rotation_format": "quat",
2459
+ "reference_quat_order": "wxyz",
2460
+ "translation_scaling_key": null,
2461
+ "rotation_scaling_key": null,
2462
+ "hold_through_clutch": false,
2463
+ "normalization_type": "percentile"
2464
+ },
2465
+ {
2466
+ "rep": "ABSOLUTE",
2467
+ "type": "NON_EEF",
2468
+ "format": "DEFAULT",
2469
+ "state_key": null,
2470
+ "input_rotation_format": "quat",
2471
+ "input_quat_order": "xyzw",
2472
+ "reference_rotation_format": "rot6d",
2473
+ "reference_quat_order": "xyzw",
2474
+ "translation_scaling_key": null,
2475
+ "rotation_scaling_key": null,
2476
+ "hold_through_clutch": false,
2477
+ "normalization_type": "percentile"
2478
+ }
2479
+ ]
2480
+ },
2481
+ "language": {
2482
+ "delta_indices": [
2483
+ 0
2484
+ ],
2485
+ "modality_keys": [
2486
+ "task"
2487
+ ],
2488
+ "sin_cos_embedding_keys": null,
2489
+ "mean_std_embedding_keys": null,
2490
+ "min_max_embedding_keys": null,
2491
+ "pass_through_keys": null,
2492
+ "action_configs": null
2493
+ }
2494
+ },
2495
+ "ucb_dvrk": {
2496
+ "video": {
2497
+ "delta_indices": [
2498
+ 0
2499
+ ],
2500
+ "modality_keys": [
2501
+ "camera_left"
2502
+ ],
2503
+ "sin_cos_embedding_keys": null,
2504
+ "mean_std_embedding_keys": null,
2505
+ "min_max_embedding_keys": null,
2506
+ "pass_through_keys": null,
2507
+ "action_configs": null
2508
+ },
2509
+ "state": {
2510
+ "delta_indices": [
2511
+ 0
2512
+ ],
2513
+ "modality_keys": [
2514
+ "psm1_joints",
2515
+ "psm1_gripper",
2516
+ "psm2_joints",
2517
+ "psm2_gripper",
2518
+ "psm1_pose",
2519
+ "psm2_pose"
2520
+ ],
2521
+ "sin_cos_embedding_keys": null,
2522
+ "mean_std_embedding_keys": [
2523
+ "psm1_joints",
2524
+ "psm1_gripper",
2525
+ "psm2_joints",
2526
+ "psm2_gripper"
2527
+ ],
2528
+ "min_max_embedding_keys": null,
2529
+ "pass_through_keys": [
2530
+ "psm1_pose",
2531
+ "psm2_pose"
2532
+ ],
2533
+ "action_configs": null
2534
+ },
2535
+ "action": {
2536
+ "delta_indices": [
2537
+ 0,
2538
+ 1,
2539
+ 2,
2540
+ 3,
2541
+ 4,
2542
+ 5,
2543
+ 6,
2544
+ 7,
2545
+ 8,
2546
+ 9,
2547
+ 10,
2548
+ 11,
2549
+ 12,
2550
+ 13,
2551
+ 14,
2552
+ 15,
2553
+ 16,
2554
+ 17,
2555
+ 18,
2556
+ 19,
2557
+ 20,
2558
+ 21,
2559
+ 22,
2560
+ 23,
2561
+ 24,
2562
+ 25,
2563
+ 26,
2564
+ 27,
2565
+ 28,
2566
+ 29,
2567
+ 30,
2568
+ 31,
2569
+ 32,
2570
+ 33,
2571
+ 34,
2572
+ 35,
2573
+ 36,
2574
+ 37,
2575
+ 38,
2576
+ 39,
2577
+ 40,
2578
+ 41,
2579
+ 42,
2580
+ 43,
2581
+ 44,
2582
+ 45,
2583
+ 46,
2584
+ 47,
2585
+ 48,
2586
+ 49
2587
+ ],
2588
+ "modality_keys": [
2589
+ "psm1_pose",
2590
+ "psm1_gripper",
2591
+ "psm2_pose",
2592
+ "psm2_gripper"
2593
+ ],
2594
+ "sin_cos_embedding_keys": null,
2595
+ "mean_std_embedding_keys": null,
2596
+ "min_max_embedding_keys": null,
2597
+ "pass_through_keys": null,
2598
+ "action_configs": [
2599
+ {
2600
+ "rep": "REL_XYZ_ROT6D",
2601
+ "type": "EEF",
2602
+ "format": "XYZ_ROT6D",
2603
+ "state_key": "psm1_pose",
2604
+ "input_rotation_format": "quat",
2605
+ "input_quat_order": "xyzw",
2606
+ "reference_rotation_format": "quat",
2607
+ "reference_quat_order": "xyzw",
2608
+ "translation_scaling_key": null,
2609
+ "rotation_scaling_key": null,
2610
+ "hold_through_clutch": false,
2611
+ "normalization_type": "percentile"
2612
+ },
2613
+ {
2614
+ "rep": "ABSOLUTE",
2615
+ "type": "NON_EEF",
2616
+ "format": "DEFAULT",
2617
+ "state_key": null,
2618
+ "input_rotation_format": "quat",
2619
+ "input_quat_order": "xyzw",
2620
+ "reference_rotation_format": "rot6d",
2621
+ "reference_quat_order": "xyzw",
2622
+ "translation_scaling_key": null,
2623
+ "rotation_scaling_key": null,
2624
+ "hold_through_clutch": false,
2625
+ "normalization_type": "percentile"
2626
+ },
2627
+ {
2628
+ "rep": "REL_XYZ_ROT6D",
2629
+ "type": "EEF",
2630
+ "format": "XYZ_ROT6D",
2631
+ "state_key": "psm2_pose",
2632
+ "input_rotation_format": "quat",
2633
+ "input_quat_order": "xyzw",
2634
+ "reference_rotation_format": "quat",
2635
+ "reference_quat_order": "xyzw",
2636
+ "translation_scaling_key": null,
2637
+ "rotation_scaling_key": null,
2638
+ "hold_through_clutch": false,
2639
+ "normalization_type": "percentile"
2640
+ },
2641
+ {
2642
+ "rep": "ABSOLUTE",
2643
+ "type": "NON_EEF",
2644
+ "format": "DEFAULT",
2645
+ "state_key": null,
2646
+ "input_rotation_format": "quat",
2647
+ "input_quat_order": "xyzw",
2648
+ "reference_rotation_format": "rot6d",
2649
+ "reference_quat_order": "xyzw",
2650
+ "translation_scaling_key": null,
2651
+ "rotation_scaling_key": null,
2652
+ "hold_through_clutch": false,
2653
+ "normalization_type": "percentile"
2654
+ }
2655
+ ]
2656
+ },
2657
+ "language": {
2658
+ "delta_indices": [
2659
+ 0
2660
+ ],
2661
+ "modality_keys": [
2662
+ "task"
2663
+ ],
2664
+ "sin_cos_embedding_keys": null,
2665
+ "mean_std_embedding_keys": null,
2666
+ "min_max_embedding_keys": null,
2667
+ "pass_through_keys": null,
2668
+ "action_configs": null
2669
+ }
2670
+ },
2671
+ "jhu_imerse_star_il": {
2672
+ "video": {
2673
+ "delta_indices": [
2674
+ 0
2675
+ ],
2676
+ "modality_keys": [
2677
+ "endoscope_left",
2678
+ "wrist_left"
2679
+ ],
2680
+ "sin_cos_embedding_keys": null,
2681
+ "mean_std_embedding_keys": null,
2682
+ "min_max_embedding_keys": null,
2683
+ "pass_through_keys": null,
2684
+ "action_configs": null
2685
+ },
2686
+ "state": {
2687
+ "delta_indices": [
2688
+ 0
2689
+ ],
2690
+ "modality_keys": [
2691
+ "kuka_joint_pos",
2692
+ "endo360_joint_pos",
2693
+ "kuka_pose"
2694
+ ],
2695
+ "sin_cos_embedding_keys": null,
2696
+ "mean_std_embedding_keys": [
2697
+ "kuka_joint_pos",
2698
+ "endo360_joint_pos"
2699
+ ],
2700
+ "min_max_embedding_keys": null,
2701
+ "pass_through_keys": [
2702
+ "kuka_pose"
2703
+ ],
2704
+ "action_configs": null
2705
+ },
2706
+ "action": {
2707
+ "delta_indices": [
2708
+ 1,
2709
+ 2,
2710
+ 3,
2711
+ 4,
2712
+ 5,
2713
+ 6,
2714
+ 7,
2715
+ 8,
2716
+ 9,
2717
+ 10,
2718
+ 11,
2719
+ 12,
2720
+ 13,
2721
+ 14,
2722
+ 15,
2723
+ 16,
2724
+ 17,
2725
+ 18,
2726
+ 19,
2727
+ 20,
2728
+ 21,
2729
+ 22,
2730
+ 23,
2731
+ 24,
2732
+ 25,
2733
+ 26,
2734
+ 27,
2735
+ 28,
2736
+ 29,
2737
+ 30,
2738
+ 31,
2739
+ 32,
2740
+ 33,
2741
+ 34,
2742
+ 35,
2743
+ 36,
2744
+ 37,
2745
+ 38,
2746
+ 39,
2747
+ 40,
2748
+ 41,
2749
+ 42,
2750
+ 43,
2751
+ 44,
2752
+ 45,
2753
+ 46,
2754
+ 47,
2755
+ 48,
2756
+ 49,
2757
+ 50
2758
+ ],
2759
+ "modality_keys": [
2760
+ "kuka_pose"
2761
+ ],
2762
+ "sin_cos_embedding_keys": null,
2763
+ "mean_std_embedding_keys": null,
2764
+ "min_max_embedding_keys": null,
2765
+ "pass_through_keys": null,
2766
+ "action_configs": [
2767
+ {
2768
+ "rep": "REL_XYZ_ROT6D",
2769
+ "type": "EEF",
2770
+ "format": "XYZ_ROT6D",
2771
+ "state_key": "kuka_pose",
2772
+ "input_rotation_format": "quat",
2773
+ "input_quat_order": "xyzw",
2774
+ "reference_rotation_format": "quat",
2775
+ "reference_quat_order": "xyzw",
2776
+ "translation_scaling_key": null,
2777
+ "rotation_scaling_key": null,
2778
+ "hold_through_clutch": false,
2779
+ "normalization_type": "percentile"
2780
+ }
2781
+ ]
2782
+ },
2783
+ "language": {
2784
+ "delta_indices": [
2785
+ 0
2786
+ ],
2787
+ "modality_keys": [
2788
+ "annotation.human.task_description"
2789
+ ],
2790
+ "sin_cos_embedding_keys": null,
2791
+ "mean_std_embedding_keys": null,
2792
+ "min_max_embedding_keys": null,
2793
+ "pass_through_keys": null,
2794
+ "action_configs": null
2795
+ }
2796
+ }
2797
+ },
2798
+ "image_crop_size": null,
2799
+ "image_target_size": null,
2800
+ "use_albumentations": true,
2801
+ "random_rotation_angle": null,
2802
+ "color_jitter_params": null,
2803
+ "shortest_image_edge": 256,
2804
+ "crop_fraction": 0.95,
2805
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
2806
+ "model_type": "eagle",
2807
+ "formalize_language": true,
2808
+ "max_state_dim": 128,
2809
+ "max_action_dim": 128,
2810
+ "max_action_horizon": 50,
2811
+ "use_percentiles": false,
2812
+ "clip_outliers": true,
2813
+ "apply_sincos_state_encoding": true,
2814
+ "use_relative_action": true
2815
+ }
2816
+ }
checkpoint-80000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e21f683c1d875028eb5aa5a13b664850cfafc375ba0f669ade9f7de305e05b47
3
+ size 14645
checkpoint-80000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95f04fade68ac8bca3e0e3a6410cecfeb6954ecb64402667bc7c4ea9d9c53c6d
3
+ size 1465
checkpoint-80000/statistics.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-80000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-80000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46d07ebf9c23abaad7dedb34e5f24bcf194b2e63cd9836ca05f6a3b638164d87
3
+ size 5841
checkpoint-80000/wandb_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"project": "finetune-gr00t-n1d6", "run_id": "groot_finetune_v2"}
checkpoint-85000/config.json ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_horizon": 50,
3
+ "add_pos_embed": true,
4
+ "apply_sincos_state_encoding": true,
5
+ "architectures": [
6
+ "Gr00tN1d6"
7
+ ],
8
+ "attn_dropout": 0.2,
9
+ "attn_implementation": null,
10
+ "backbone_embedding_dim": 2048,
11
+ "backbone_model_type": "eagle",
12
+ "backbone_trainable_params_fp32": true,
13
+ "collator_overwrite_image_inputs": false,
14
+ "color_jitter_params": {
15
+ "brightness": 0.1,
16
+ "contrast": 0.1,
17
+ "hue": 0.1,
18
+ "saturation": 0.1
19
+ },
20
+ "crop_fraction": 0.95,
21
+ "diffusion_model_cfg": {
22
+ "attention_head_dim": 48,
23
+ "dropout": 0.2,
24
+ "final_dropout": true,
25
+ "interleave_self_attention": true,
26
+ "norm_type": "ada_norm",
27
+ "num_attention_heads": 32,
28
+ "num_layers": 32,
29
+ "output_dim": 1024,
30
+ "positional_embeddings": null
31
+ },
32
+ "eagle_collator": true,
33
+ "formalize_language": true,
34
+ "gemma_collator": false,
35
+ "hidden_size": 1024,
36
+ "image_crop_size": null,
37
+ "image_target_size": null,
38
+ "input_embedding_dim": 1536,
39
+ "load_bf16": true,
40
+ "max_action_dim": 128,
41
+ "max_num_embodiments": 32,
42
+ "max_seq_len": 1024,
43
+ "max_state_dim": 128,
44
+ "model_dtype": "bfloat16",
45
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
46
+ "model_type": "Gr00tN1d6",
47
+ "noise_beta_alpha": 1.5,
48
+ "noise_beta_beta": 1.0,
49
+ "noise_s": 0.999,
50
+ "num_inference_timesteps": 4,
51
+ "num_timestep_buckets": 1000,
52
+ "random_rotation_angle": null,
53
+ "reproject_vision": false,
54
+ "select_layer": 16,
55
+ "shortest_image_edge": 256,
56
+ "soft_prompt_lr_scale": 1.0,
57
+ "soft_prompt_num_tokens": 32,
58
+ "state_dropout_prob": 0.0,
59
+ "state_dropout_prob_per_embodiment": {
60
+ "cmr_versius": 1.0,
61
+ "hamlyn_dvrk_30hz": 1.0,
62
+ "jhu_imerse_dvrk": 1.0,
63
+ "jhu_imerse_dvrk_mono": 1.0,
64
+ "jhu_imerse_star_il": 1.0,
65
+ "jhu_lscr_dvrk_smarts": 1.0,
66
+ "obuda_dvrk": 1.0,
67
+ "rob_surgical_bitrack": 1.0,
68
+ "stanford_dvrk_real": 1.0,
69
+ "tud_tundra_ur5e": 1.0,
70
+ "turin_mitic_ex_vivo": 1.0,
71
+ "ucb_dvrk": 1.0,
72
+ "ucsd_dvrk": 1.0,
73
+ "ustc_torin_tuodao": 1.0
74
+ },
75
+ "torch_dtype": "bfloat16",
76
+ "transformers_version": "4.51.3",
77
+ "tune_diffusion_model": true,
78
+ "tune_llm": false,
79
+ "tune_projector": true,
80
+ "tune_top_llm_layers": 4,
81
+ "tune_visual": false,
82
+ "tune_vlln": true,
83
+ "use_albumentations_transforms": true,
84
+ "use_alternate_vl_dit": true,
85
+ "use_flash_attention": true,
86
+ "use_relative_action": true,
87
+ "use_soft_prompts": false,
88
+ "use_vlln": true
89
+ }
checkpoint-85000/embodiment_id.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "robocasa_panda_omron": 13,
3
+ "gr1": 20,
4
+ "behavior_r1_pro": 24,
5
+ "unitree_g1": 8,
6
+ "oxe_google": 0,
7
+ "oxe_widowx": 1,
8
+ "libero_panda": 2,
9
+ "new_embodiment": 10,
10
+ "jhu_imerse_dvrk": 3,
11
+ "cmr_versius": 4,
12
+ "ucb_dvrk": 5,
13
+ "sanoscience_sim": 6,
14
+ "tum_sonata_franka": 7,
15
+ "hamlyn_dvrk_15hz": 9,
16
+ "hamlyn_dvrk_30hz": 11,
17
+ "ustc_torin_tuodao": 12,
18
+ "ucsd_dvrk": 14,
19
+ "jhu_imerse_dvrk_mono": 15,
20
+ "rob_surgical_bitrack": 16,
21
+ "stanford_dvrk_real": 17,
22
+ "obuda_dvrk": 18,
23
+ "polyu_sim": 19,
24
+ "moon_maestro": 21,
25
+ "jhu_lscr_dvrk_miracle": 22,
26
+ "jhu_lscr_dvrk_smarts": 23,
27
+ "jhu_imerse_star_il": 27,
28
+ "tud_tundra_ur5e": 25,
29
+ "turin_mitic_ex_vivo": 26,
30
+ "oxe_droid": 29
31
+ }
checkpoint-85000/experiment_cfg/conf.yaml ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ load_config_path: null
2
+ model:
3
+ model_type: Gr00tN1d6
4
+ model_dtype: bfloat16
5
+ model_name: nvidia/Eagle-Block2A-2B-v2
6
+ backbone_model_type: eagle
7
+ model_revision: null
8
+ tune_top_llm_layers: 4
9
+ backbone_embedding_dim: 2048
10
+ tune_llm: false
11
+ tune_visual: false
12
+ select_layer: 16
13
+ reproject_vision: false
14
+ use_flash_attention: true
15
+ load_bf16: false
16
+ collator_overwrite_image_inputs: false
17
+ eagle_collator: true
18
+ backbone_trainable_params_fp32: true
19
+ image_crop_size: null
20
+ image_target_size: null
21
+ shortest_image_edge: 256
22
+ crop_fraction: 0.95
23
+ random_rotation_angle: null
24
+ color_jitter_params: null
25
+ use_albumentations_transforms: true
26
+ extra_augmentation_config: null
27
+ formalize_language: true
28
+ apply_sincos_state_encoding: false
29
+ use_relative_action: true
30
+ max_state_dim: 29
31
+ max_action_dim: 29
32
+ action_horizon: 50
33
+ hidden_size: 1024
34
+ input_embedding_dim: 1536
35
+ add_pos_embed: true
36
+ attn_dropout: 0.2
37
+ use_vlln: true
38
+ max_seq_len: 1024
39
+ use_alternate_vl_dit: true
40
+ attend_text_every_n_blocks: 2
41
+ diffusion_model_cfg:
42
+ positional_embeddings: null
43
+ num_layers: 32
44
+ num_attention_heads: 32
45
+ attention_head_dim: 48
46
+ norm_type: ada_norm
47
+ dropout: 0.2
48
+ final_dropout: true
49
+ output_dim: 1024
50
+ interleave_self_attention: true
51
+ num_inference_timesteps: 4
52
+ noise_beta_alpha: 1.5
53
+ noise_beta_beta: 1.0
54
+ noise_s: 0.999
55
+ num_timestep_buckets: 1000
56
+ tune_projector: true
57
+ tune_diffusion_model: true
58
+ tune_vlln: true
59
+ state_dropout_prob: 0.0
60
+ state_dropout_prob_per_embodiment: null
61
+ state_additive_noise_scale: 0.0
62
+ max_num_embodiments: 32
63
+ data:
64
+ datasets:
65
+ - dataset_paths:
66
+ - /hkfs/work/workspace/scratch/uenyr-thesis/data/peg_transfer_lerobot
67
+ embodiment_tag: jhu_imerse_dvrk_mono
68
+ mix_ratio: 1.0
69
+ dataset_type: physical_embodiment
70
+ val_dataset_path: null
71
+ exclude_splits: null
72
+ include_splits: null
73
+ modality_configs:
74
+ jhu_imerse_dvrk_mono:
75
+ video:
76
+ delta_indices:
77
+ - 0
78
+ modality_keys:
79
+ - endoscope_left
80
+ sin_cos_embedding_keys: null
81
+ mean_std_embedding_keys: null
82
+ min_max_embedding_keys: null
83
+ pass_through_keys: null
84
+ action_configs: null
85
+ state:
86
+ delta_indices:
87
+ - 0
88
+ modality_keys:
89
+ - psm1_pose
90
+ - psm1_gripper
91
+ - psm2_pose
92
+ - psm2_gripper
93
+ sin_cos_embedding_keys: null
94
+ mean_std_embedding_keys:
95
+ - psm1_pose
96
+ - psm1_gripper
97
+ - psm2_pose
98
+ - psm2_gripper
99
+ min_max_embedding_keys: null
100
+ pass_through_keys: null
101
+ action_configs: null
102
+ action:
103
+ delta_indices:
104
+ - 0
105
+ - 1
106
+ - 2
107
+ - 3
108
+ - 4
109
+ - 5
110
+ - 6
111
+ - 7
112
+ - 8
113
+ - 9
114
+ - 10
115
+ - 11
116
+ - 12
117
+ - 13
118
+ - 14
119
+ - 15
120
+ - 16
121
+ - 17
122
+ - 18
123
+ - 19
124
+ - 20
125
+ - 21
126
+ - 22
127
+ - 23
128
+ - 24
129
+ - 25
130
+ - 26
131
+ - 27
132
+ - 28
133
+ - 29
134
+ - 30
135
+ - 31
136
+ - 32
137
+ - 33
138
+ - 34
139
+ - 35
140
+ - 36
141
+ - 37
142
+ - 38
143
+ - 39
144
+ - 40
145
+ - 41
146
+ - 42
147
+ - 43
148
+ - 44
149
+ - 45
150
+ - 46
151
+ - 47
152
+ - 48
153
+ - 49
154
+ modality_keys:
155
+ - psm1_pose
156
+ - psm1_gripper
157
+ - psm2_pose
158
+ - psm2_gripper
159
+ sin_cos_embedding_keys: null
160
+ mean_std_embedding_keys: null
161
+ min_max_embedding_keys: null
162
+ pass_through_keys: null
163
+ action_configs:
164
+ - rep: REL_XYZ_ROT6D
165
+ type: EEF
166
+ format: XYZ_ROT6D
167
+ state_key: psm1_pose
168
+ input_rotation_format: quat
169
+ input_quat_order: xyzw
170
+ reference_rotation_format: quat
171
+ reference_quat_order: xyzw
172
+ translation_scaling_key: null
173
+ rotation_scaling_key: null
174
+ hold_through_clutch: false
175
+ normalization_type: temporal_meanstd
176
+ - rep: ABSOLUTE
177
+ type: NON_EEF
178
+ format: DEFAULT
179
+ state_key: null
180
+ input_rotation_format: quat
181
+ input_quat_order: xyzw
182
+ reference_rotation_format: rot6d
183
+ reference_quat_order: xyzw
184
+ translation_scaling_key: null
185
+ rotation_scaling_key: null
186
+ hold_through_clutch: false
187
+ normalization_type: temporal_meanstd
188
+ - rep: REL_XYZ_ROT6D
189
+ type: EEF
190
+ format: XYZ_ROT6D
191
+ state_key: psm2_pose
192
+ input_rotation_format: quat
193
+ input_quat_order: xyzw
194
+ reference_rotation_format: quat
195
+ reference_quat_order: xyzw
196
+ translation_scaling_key: null
197
+ rotation_scaling_key: null
198
+ hold_through_clutch: false
199
+ normalization_type: temporal_meanstd
200
+ - rep: ABSOLUTE
201
+ type: NON_EEF
202
+ format: DEFAULT
203
+ state_key: null
204
+ input_rotation_format: quat
205
+ input_quat_order: xyzw
206
+ reference_rotation_format: rot6d
207
+ reference_quat_order: xyzw
208
+ translation_scaling_key: null
209
+ rotation_scaling_key: null
210
+ hold_through_clutch: false
211
+ normalization_type: temporal_meanstd
212
+ language:
213
+ delta_indices:
214
+ - 0
215
+ modality_keys:
216
+ - annotation.human.task_description
217
+ sin_cos_embedding_keys: null
218
+ mean_std_embedding_keys: null
219
+ min_max_embedding_keys: null
220
+ pass_through_keys: null
221
+ action_configs: null
222
+ download_cache: false
223
+ shard_size: 1024
224
+ episode_sampling_rate: 0.1
225
+ num_shards_per_epoch: 100000
226
+ override_pretraining_statistics: true
227
+ mode: single_turn
228
+ random_chop: 0.0
229
+ mock_dataset_mode: false
230
+ shuffle: true
231
+ seed: 42
232
+ multiprocessing_context: fork
233
+ allow_padding: false
234
+ subsample_ratio: 1.0
235
+ image_crop_size:
236
+ - 244
237
+ - 244
238
+ image_target_size:
239
+ - 224
240
+ - 224
241
+ video_backend: torchcodec
242
+ training:
243
+ output_dir: /hkfs/work/workspace/scratch/uenyr-thesis/outputs/groot_finetune_v2
244
+ experiment_name: null
245
+ max_steps: 100000
246
+ global_batch_size: 8
247
+ batch_size: null
248
+ gradient_accumulation_steps: 1
249
+ learning_rate: 0.0001
250
+ lr_scheduler_type: cosine
251
+ weight_decay: 1.0e-05
252
+ warmup_ratio: 0.05
253
+ warmup_steps: 0
254
+ max_grad_norm: 1.0
255
+ optim: adamw_torch
256
+ start_from_checkpoint: /hkfs/work/workspace/scratch/uenyr-thesis/pretrained/GR00T-H
257
+ tf32: true
258
+ fp16: false
259
+ bf16: true
260
+ eval_bf16: true
261
+ logging_steps: 10
262
+ save_steps: 10000
263
+ save_total_limit: 5
264
+ save_vl_model: false
265
+ upload_checkpoints: false
266
+ upload_every: 1000
267
+ upload_last_n_checkpoints: 5
268
+ max_concurrent_uploads: 2
269
+ eval_strategy: 'no'
270
+ eval_steps: 500
271
+ eval_set_split_ratio: 0.1
272
+ eval_batch_size: 2
273
+ save_best_eval_metric_name: ''
274
+ save_best_eval_metric_greater_is_better: true
275
+ deepspeed_stage: 2
276
+ gradient_checkpointing: false
277
+ transformers_trust_remote_code: true
278
+ transformers_local_files_only: false
279
+ transformers_cache_dir: null
280
+ transformers_access_token: null
281
+ use_ddp: false
282
+ ddp_bucket_cap_mb: 100
283
+ num_gpus: 1
284
+ dataloader_num_workers: 4
285
+ remove_unused_columns: false
286
+ use_wandb: false
287
+ wandb_project: finetune-gr00t-n1d6
288
+ enable_profiling: false
289
+ max_retries: 3
290
+ assert_loss_less_than: null
291
+ add_rl_callback: false
292
+ enable_open_loop_eval: false
293
+ open_loop_eval_traj_ids:
294
+ - 0
295
+ open_loop_eval_steps_per_traj: 100
296
+ open_loop_eval_plot_indices: null
297
+ max_steps: 100000
298
+ save_steps: 10000
checkpoint-85000/experiment_cfg/config.yaml ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !!python/object:gr00t.configs.base_config.Config
2
+ data: !!python/object:gr00t.configs.data.data_config.DataConfig
3
+ allow_padding: false
4
+ datasets:
5
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
6
+ dataset_paths:
7
+ - /hkfs/work/workspace/scratch/uenyr-thesis/data/peg_transfer_lerobot
8
+ dataset_type: physical_embodiment
9
+ embodiment_tag: jhu_imerse_dvrk_mono
10
+ exclude_splits: null
11
+ include_splits: null
12
+ mix_ratio: 1.0
13
+ val_dataset_path: null
14
+ download_cache: false
15
+ episode_sampling_rate: 0.1
16
+ image_crop_size:
17
+ - 244
18
+ - 244
19
+ image_target_size:
20
+ - 224
21
+ - 224
22
+ mock_dataset_mode: false
23
+ modality_configs:
24
+ jhu_imerse_dvrk_mono:
25
+ action: !!python/object:gr00t.data.types.ModalityConfig
26
+ action_configs:
27
+ - !!python/object:gr00t.data.types.ActionConfig
28
+ format: &id001 !!python/object/apply:gr00t.data.types.ActionFormat
29
+ - xyz+rot6d
30
+ hold_through_clutch: false
31
+ input_quat_order: xyzw
32
+ input_rotation_format: quat
33
+ normalization_type: temporal_meanstd
34
+ reference_quat_order: xyzw
35
+ reference_rotation_format: quat
36
+ rep: &id002 !!python/object/apply:gr00t.data.types.ActionRepresentation
37
+ - rel_xyz_rot6d
38
+ rotation_scaling_key: null
39
+ state_key: psm1_pose
40
+ translation_scaling_key: null
41
+ type: &id003 !!python/object/apply:gr00t.data.types.ActionType
42
+ - eef
43
+ - !!python/object:gr00t.data.types.ActionConfig
44
+ format: &id004 !!python/object/apply:gr00t.data.types.ActionFormat
45
+ - default
46
+ hold_through_clutch: false
47
+ input_quat_order: xyzw
48
+ input_rotation_format: quat
49
+ normalization_type: temporal_meanstd
50
+ reference_quat_order: xyzw
51
+ reference_rotation_format: rot6d
52
+ rep: &id005 !!python/object/apply:gr00t.data.types.ActionRepresentation
53
+ - absolute
54
+ rotation_scaling_key: null
55
+ state_key: null
56
+ translation_scaling_key: null
57
+ type: &id006 !!python/object/apply:gr00t.data.types.ActionType
58
+ - non_eef
59
+ - !!python/object:gr00t.data.types.ActionConfig
60
+ format: *id001
61
+ hold_through_clutch: false
62
+ input_quat_order: xyzw
63
+ input_rotation_format: quat
64
+ normalization_type: temporal_meanstd
65
+ reference_quat_order: xyzw
66
+ reference_rotation_format: quat
67
+ rep: *id002
68
+ rotation_scaling_key: null
69
+ state_key: psm2_pose
70
+ translation_scaling_key: null
71
+ type: *id003
72
+ - !!python/object:gr00t.data.types.ActionConfig
73
+ format: *id004
74
+ hold_through_clutch: false
75
+ input_quat_order: xyzw
76
+ input_rotation_format: quat
77
+ normalization_type: temporal_meanstd
78
+ reference_quat_order: xyzw
79
+ reference_rotation_format: rot6d
80
+ rep: *id005
81
+ rotation_scaling_key: null
82
+ state_key: null
83
+ translation_scaling_key: null
84
+ type: *id006
85
+ delta_indices:
86
+ - 0
87
+ - 1
88
+ - 2
89
+ - 3
90
+ - 4
91
+ - 5
92
+ - 6
93
+ - 7
94
+ - 8
95
+ - 9
96
+ - 10
97
+ - 11
98
+ - 12
99
+ - 13
100
+ - 14
101
+ - 15
102
+ - 16
103
+ - 17
104
+ - 18
105
+ - 19
106
+ - 20
107
+ - 21
108
+ - 22
109
+ - 23
110
+ - 24
111
+ - 25
112
+ - 26
113
+ - 27
114
+ - 28
115
+ - 29
116
+ - 30
117
+ - 31
118
+ - 32
119
+ - 33
120
+ - 34
121
+ - 35
122
+ - 36
123
+ - 37
124
+ - 38
125
+ - 39
126
+ - 40
127
+ - 41
128
+ - 42
129
+ - 43
130
+ - 44
131
+ - 45
132
+ - 46
133
+ - 47
134
+ - 48
135
+ - 49
136
+ mean_std_embedding_keys: null
137
+ min_max_embedding_keys: null
138
+ modality_keys:
139
+ - psm1_pose
140
+ - psm1_gripper
141
+ - psm2_pose
142
+ - psm2_gripper
143
+ pass_through_keys: null
144
+ sin_cos_embedding_keys: null
145
+ language: !!python/object:gr00t.data.types.ModalityConfig
146
+ action_configs: null
147
+ delta_indices:
148
+ - 0
149
+ mean_std_embedding_keys: null
150
+ min_max_embedding_keys: null
151
+ modality_keys:
152
+ - annotation.human.task_description
153
+ pass_through_keys: null
154
+ sin_cos_embedding_keys: null
155
+ state: !!python/object:gr00t.data.types.ModalityConfig
156
+ action_configs: null
157
+ delta_indices:
158
+ - 0
159
+ mean_std_embedding_keys:
160
+ - psm1_pose
161
+ - psm1_gripper
162
+ - psm2_pose
163
+ - psm2_gripper
164
+ min_max_embedding_keys: null
165
+ modality_keys:
166
+ - psm1_pose
167
+ - psm1_gripper
168
+ - psm2_pose
169
+ - psm2_gripper
170
+ pass_through_keys: null
171
+ sin_cos_embedding_keys: null
172
+ video: !!python/object:gr00t.data.types.ModalityConfig
173
+ action_configs: null
174
+ delta_indices:
175
+ - 0
176
+ mean_std_embedding_keys: null
177
+ min_max_embedding_keys: null
178
+ modality_keys:
179
+ - endoscope_left
180
+ pass_through_keys: null
181
+ sin_cos_embedding_keys: null
182
+ mode: single_turn
183
+ multiprocessing_context: fork
184
+ num_shards_per_epoch: 100000
185
+ override_pretraining_statistics: true
186
+ random_chop: 0.0
187
+ seed: 42
188
+ shard_size: 1024
189
+ shuffle: true
190
+ subsample_ratio: 1.0
191
+ video_backend: torchcodec
192
+ load_config_path: null
193
+ model: !!python/object:gr00t.configs.model.gr00t_n1d6.Gr00tN1d6Config
194
+ _attn_implementation_autoset: false
195
+ _attn_implementation_internal: null
196
+ _commit_hash: null
197
+ _name_or_path: ''
198
+ add_cross_attention: false
199
+ architectures: null
200
+ backbone_model_type: eagle
201
+ backbone_trainable_params_fp32: true
202
+ bad_words_ids: null
203
+ begin_suppress_tokens: null
204
+ bos_token_id: null
205
+ chunk_size_feed_forward: 0
206
+ color_jitter_params: null
207
+ cross_attention_hidden_size: null
208
+ decoder_start_token_id: null
209
+ diffusion_model_cfg:
210
+ attention_head_dim: 48
211
+ dropout: 0.2
212
+ final_dropout: true
213
+ interleave_self_attention: true
214
+ norm_type: ada_norm
215
+ num_attention_heads: 32
216
+ num_layers: 32
217
+ output_dim: 1024
218
+ positional_embeddings: null
219
+ diversity_penalty: 0.0
220
+ do_sample: false
221
+ eagle_collator: true
222
+ early_stopping: false
223
+ encoder_no_repeat_ngram_size: 0
224
+ eos_token_id: null
225
+ exponential_decay_length_penalty: null
226
+ extra_augmentation_config: null
227
+ finetuning_task: null
228
+ forced_bos_token_id: null
229
+ forced_eos_token_id: null
230
+ id2label:
231
+ 0: LABEL_0
232
+ 1: LABEL_1
233
+ is_decoder: false
234
+ is_encoder_decoder: false
235
+ label2id:
236
+ LABEL_0: 0
237
+ LABEL_1: 1
238
+ length_penalty: 1.0
239
+ load_bf16: false
240
+ max_length: 20
241
+ min_length: 0
242
+ model_name: nvidia/Eagle-Block2A-2B-v2
243
+ no_repeat_ngram_size: 0
244
+ num_beam_groups: 1
245
+ num_beams: 1
246
+ num_return_sequences: 1
247
+ output_attentions: false
248
+ output_hidden_states: false
249
+ output_scores: false
250
+ pad_token_id: null
251
+ prefix: null
252
+ problem_type: null
253
+ pruned_heads: {}
254
+ random_rotation_angle: null
255
+ remove_invalid_values: false
256
+ repetition_penalty: 1.0
257
+ reproject_vision: false
258
+ return_dict: true
259
+ return_dict_in_generate: false
260
+ sep_token_id: null
261
+ state_dropout_prob: 0.0
262
+ state_dropout_prob_per_embodiment: null
263
+ suppress_tokens: null
264
+ task_specific_params: null
265
+ temperature: 1.0
266
+ tf_legacy_loss: false
267
+ tie_encoder_decoder: false
268
+ tie_word_embeddings: true
269
+ tokenizer_class: null
270
+ top_k: 50
271
+ top_p: 1.0
272
+ torch_dtype: null
273
+ torchscript: false
274
+ transformers_version: null
275
+ tune_diffusion_model: true
276
+ tune_llm: false
277
+ tune_projector: true
278
+ tune_visual: false
279
+ typical_p: 1.0
280
+ use_bfloat16: false
281
+ use_relative_action: true
282
+ training: !!python/object:gr00t.configs.training.training_config.TrainingConfig
283
+ add_rl_callback: false
284
+ assert_loss_less_than: null
285
+ batch_size: null
286
+ bf16: true
287
+ dataloader_num_workers: 4
288
+ ddp_bucket_cap_mb: 100
289
+ deepspeed_stage: 2
290
+ enable_open_loop_eval: false
291
+ enable_profiling: false
292
+ eval_batch_size: 2
293
+ eval_bf16: true
294
+ eval_set_split_ratio: 0.1
295
+ eval_steps: 500
296
+ eval_strategy: 'no'
297
+ experiment_name: null
298
+ fp16: false
299
+ global_batch_size: 8
300
+ gradient_accumulation_steps: 1
301
+ gradient_checkpointing: false
302
+ learning_rate: 0.0001
303
+ logging_steps: 10
304
+ lr_scheduler_type: cosine
305
+ max_concurrent_uploads: 2
306
+ max_grad_norm: 1.0
307
+ max_retries: 3
308
+ max_steps: 100000
309
+ num_gpus: 1
310
+ open_loop_eval_plot_indices: null
311
+ open_loop_eval_steps_per_traj: 100
312
+ open_loop_eval_traj_ids:
313
+ - 0
314
+ optim: adamw_torch
315
+ output_dir: /hkfs/work/workspace/scratch/uenyr-thesis/outputs/groot_finetune_v2
316
+ remove_unused_columns: false
317
+ save_best_eval_metric_greater_is_better: true
318
+ save_best_eval_metric_name: ''
319
+ save_steps: 10000
320
+ save_total_limit: 5
321
+ save_vl_model: false
322
+ start_from_checkpoint: /hkfs/work/workspace/scratch/uenyr-thesis/pretrained/GR00T-H
323
+ tf32: true
324
+ transformers_access_token: null
325
+ transformers_cache_dir: null
326
+ transformers_local_files_only: false
327
+ transformers_trust_remote_code: true
328
+ upload_checkpoints: false
329
+ upload_every: 1000
330
+ upload_last_n_checkpoints: 5
331
+ use_ddp: false
332
+ use_wandb: false
333
+ wandb_project: finetune-gr00t-n1d6
334
+ warmup_ratio: 0.05
335
+ warmup_steps: 0
336
+ weight_decay: 1.0e-05
checkpoint-85000/experiment_cfg/dataset_statistics.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-85000/experiment_cfg/final_model_config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "Gr00tN1d6",
3
+ "model_dtype": "bfloat16",
4
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
5
+ "backbone_model_type": "eagle",
6
+ "model_revision": null,
7
+ "tune_top_llm_layers": 4,
8
+ "backbone_embedding_dim": 2048,
9
+ "tune_llm": false,
10
+ "tune_visual": false,
11
+ "select_layer": 16,
12
+ "reproject_vision": false,
13
+ "use_flash_attention": true,
14
+ "load_bf16": true,
15
+ "collator_overwrite_image_inputs": false,
16
+ "eagle_collator": true,
17
+ "backbone_trainable_params_fp32": true,
18
+ "extra_augmentation_config": null,
19
+ "apply_sincos_state_encoding": true,
20
+ "use_relative_action": true,
21
+ "max_state_dim": 128,
22
+ "max_action_dim": 128,
23
+ "action_horizon": 50,
24
+ "hidden_size": 1024,
25
+ "input_embedding_dim": 1536,
26
+ "add_pos_embed": true,
27
+ "attn_dropout": 0.2,
28
+ "use_vlln": true,
29
+ "max_seq_len": 1024,
30
+ "use_alternate_vl_dit": true,
31
+ "attend_text_every_n_blocks": 2,
32
+ "diffusion_model_cfg": {
33
+ "attention_head_dim": 48,
34
+ "dropout": 0.2,
35
+ "final_dropout": true,
36
+ "interleave_self_attention": true,
37
+ "norm_type": "ada_norm",
38
+ "num_attention_heads": 32,
39
+ "num_layers": 32,
40
+ "output_dim": 1024,
41
+ "positional_embeddings": null
42
+ },
43
+ "num_inference_timesteps": 4,
44
+ "noise_beta_alpha": 1.5,
45
+ "noise_beta_beta": 1.0,
46
+ "noise_s": 0.999,
47
+ "num_timestep_buckets": 1000,
48
+ "tune_projector": true,
49
+ "tune_diffusion_model": true,
50
+ "tune_vlln": true,
51
+ "state_dropout_prob": 0.0,
52
+ "state_dropout_prob_per_embodiment": {
53
+ "cmr_versius": 1.0,
54
+ "jhu_imerse_dvrk": 1.0,
55
+ "obuda_dvrk": 1.0,
56
+ "stanford_dvrk_real": 1.0,
57
+ "ucb_dvrk": 1.0,
58
+ "ucsd_dvrk": 1.0,
59
+ "hamlyn_dvrk_30hz": 1.0,
60
+ "jhu_imerse_dvrk_mono": 1.0,
61
+ "jhu_imerse_star_il": 1.0,
62
+ "jhu_lscr_dvrk_smarts": 1.0,
63
+ "rob_surgical_bitrack": 1.0,
64
+ "tud_tundra_ur5e": 1.0,
65
+ "turin_mitic_ex_vivo": 1.0,
66
+ "ustc_torin_tuodao": 1.0
67
+ },
68
+ "state_additive_noise_scale": 0.0,
69
+ "max_num_embodiments": 32
70
+ }
checkpoint-85000/experiment_cfg/final_processor_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-85000/model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3eae94697e58d5309325a41535a6578f66758f8c9159b1cb241aac76117f557
3
+ size 4990126640
checkpoint-85000/model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca642a7baa572642f99963f9925e5a234fd67bc52e65ac98309d430a0ae1827d
3
+ size 4823190320
checkpoint-85000/model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-85000/processor_config.json ADDED
@@ -0,0 +1,2816 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "Gr00tN1d6Processor",
3
+ "processor_kwargs": {
4
+ "modality_configs": {
5
+ "behavior_r1_pro": {
6
+ "video": {
7
+ "delta_indices": [
8
+ 0
9
+ ],
10
+ "modality_keys": [
11
+ "observation.images.rgb.head_256_256",
12
+ "observation.images.rgb.left_wrist_256_256",
13
+ "observation.images.rgb.right_wrist_256_256"
14
+ ],
15
+ "sin_cos_embedding_keys": null,
16
+ "mean_std_embedding_keys": null,
17
+ "min_max_embedding_keys": null,
18
+ "pass_through_keys": null,
19
+ "action_configs": null
20
+ },
21
+ "state": {
22
+ "delta_indices": [
23
+ 0
24
+ ],
25
+ "modality_keys": [
26
+ "robot_pos",
27
+ "robot_ori_cos",
28
+ "robot_ori_sin",
29
+ "robot_2d_ori",
30
+ "robot_2d_ori_cos",
31
+ "robot_2d_ori_sin",
32
+ "robot_lin_vel",
33
+ "robot_ang_vel",
34
+ "arm_left_qpos",
35
+ "arm_left_qpos_sin",
36
+ "arm_left_qpos_cos",
37
+ "eef_left_pos",
38
+ "eef_left_quat",
39
+ "gripper_left_qpos",
40
+ "arm_right_qpos",
41
+ "arm_right_qpos_sin",
42
+ "arm_right_qpos_cos",
43
+ "eef_right_pos",
44
+ "eef_right_quat",
45
+ "gripper_right_qpos",
46
+ "trunk_qpos"
47
+ ],
48
+ "sin_cos_embedding_keys": null,
49
+ "mean_std_embedding_keys": null,
50
+ "min_max_embedding_keys": null,
51
+ "pass_through_keys": null,
52
+ "action_configs": null
53
+ },
54
+ "action": {
55
+ "delta_indices": [
56
+ 0,
57
+ 1,
58
+ 2,
59
+ 3,
60
+ 4,
61
+ 5,
62
+ 6,
63
+ 7,
64
+ 8,
65
+ 9,
66
+ 10,
67
+ 11,
68
+ 12,
69
+ 13,
70
+ 14,
71
+ 15,
72
+ 16,
73
+ 17,
74
+ 18,
75
+ 19,
76
+ 20,
77
+ 21,
78
+ 22,
79
+ 23,
80
+ 24,
81
+ 25,
82
+ 26,
83
+ 27,
84
+ 28,
85
+ 29,
86
+ 30,
87
+ 31
88
+ ],
89
+ "modality_keys": [
90
+ "base",
91
+ "torso",
92
+ "left_arm",
93
+ "left_gripper",
94
+ "right_arm",
95
+ "right_gripper"
96
+ ],
97
+ "sin_cos_embedding_keys": null,
98
+ "mean_std_embedding_keys": null,
99
+ "min_max_embedding_keys": null,
100
+ "pass_through_keys": null,
101
+ "action_configs": [
102
+ {
103
+ "rep": "ABSOLUTE",
104
+ "type": "NON_EEF",
105
+ "format": "DEFAULT",
106
+ "state_key": null,
107
+ "input_rotation_format": "quat",
108
+ "input_quat_order": "xyzw",
109
+ "reference_rotation_format": "rot6d",
110
+ "reference_quat_order": "xyzw",
111
+ "translation_scaling_key": null,
112
+ "rotation_scaling_key": null,
113
+ "hold_through_clutch": false,
114
+ "normalization_type": "percentile"
115
+ },
116
+ {
117
+ "rep": "RELATIVE",
118
+ "type": "NON_EEF",
119
+ "format": "DEFAULT",
120
+ "state_key": "trunk_qpos",
121
+ "input_rotation_format": "quat",
122
+ "input_quat_order": "xyzw",
123
+ "reference_rotation_format": "rot6d",
124
+ "reference_quat_order": "xyzw",
125
+ "translation_scaling_key": null,
126
+ "rotation_scaling_key": null,
127
+ "hold_through_clutch": false,
128
+ "normalization_type": "percentile"
129
+ },
130
+ {
131
+ "rep": "RELATIVE",
132
+ "type": "NON_EEF",
133
+ "format": "DEFAULT",
134
+ "state_key": "arm_left_qpos",
135
+ "input_rotation_format": "quat",
136
+ "input_quat_order": "xyzw",
137
+ "reference_rotation_format": "rot6d",
138
+ "reference_quat_order": "xyzw",
139
+ "translation_scaling_key": null,
140
+ "rotation_scaling_key": null,
141
+ "hold_through_clutch": false,
142
+ "normalization_type": "percentile"
143
+ },
144
+ {
145
+ "rep": "ABSOLUTE",
146
+ "type": "NON_EEF",
147
+ "format": "DEFAULT",
148
+ "state_key": null,
149
+ "input_rotation_format": "quat",
150
+ "input_quat_order": "xyzw",
151
+ "reference_rotation_format": "rot6d",
152
+ "reference_quat_order": "xyzw",
153
+ "translation_scaling_key": null,
154
+ "rotation_scaling_key": null,
155
+ "hold_through_clutch": false,
156
+ "normalization_type": "percentile"
157
+ },
158
+ {
159
+ "rep": "RELATIVE",
160
+ "type": "NON_EEF",
161
+ "format": "DEFAULT",
162
+ "state_key": "arm_right_qpos",
163
+ "input_rotation_format": "quat",
164
+ "input_quat_order": "xyzw",
165
+ "reference_rotation_format": "rot6d",
166
+ "reference_quat_order": "xyzw",
167
+ "translation_scaling_key": null,
168
+ "rotation_scaling_key": null,
169
+ "hold_through_clutch": false,
170
+ "normalization_type": "percentile"
171
+ },
172
+ {
173
+ "rep": "ABSOLUTE",
174
+ "type": "NON_EEF",
175
+ "format": "DEFAULT",
176
+ "state_key": null,
177
+ "input_rotation_format": "quat",
178
+ "input_quat_order": "xyzw",
179
+ "reference_rotation_format": "rot6d",
180
+ "reference_quat_order": "xyzw",
181
+ "translation_scaling_key": null,
182
+ "rotation_scaling_key": null,
183
+ "hold_through_clutch": false,
184
+ "normalization_type": "percentile"
185
+ }
186
+ ]
187
+ },
188
+ "language": {
189
+ "delta_indices": [
190
+ 0
191
+ ],
192
+ "modality_keys": [
193
+ "annotation.human.coarse_action"
194
+ ],
195
+ "sin_cos_embedding_keys": null,
196
+ "mean_std_embedding_keys": null,
197
+ "min_max_embedding_keys": null,
198
+ "pass_through_keys": null,
199
+ "action_configs": null
200
+ }
201
+ },
202
+ "gr1": {
203
+ "video": {
204
+ "delta_indices": [
205
+ 0
206
+ ],
207
+ "modality_keys": [
208
+ "ego_view_bg_crop_pad_res256_freq20"
209
+ ],
210
+ "sin_cos_embedding_keys": null,
211
+ "mean_std_embedding_keys": null,
212
+ "min_max_embedding_keys": null,
213
+ "pass_through_keys": null,
214
+ "action_configs": null
215
+ },
216
+ "state": {
217
+ "delta_indices": [
218
+ 0
219
+ ],
220
+ "modality_keys": [
221
+ "left_arm",
222
+ "right_arm",
223
+ "left_hand",
224
+ "right_hand",
225
+ "waist"
226
+ ],
227
+ "sin_cos_embedding_keys": [
228
+ "left_arm",
229
+ "right_arm",
230
+ "left_hand",
231
+ "right_hand",
232
+ "waist"
233
+ ],
234
+ "mean_std_embedding_keys": null,
235
+ "min_max_embedding_keys": null,
236
+ "pass_through_keys": null,
237
+ "action_configs": null
238
+ },
239
+ "action": {
240
+ "delta_indices": [
241
+ 0,
242
+ 1,
243
+ 2,
244
+ 3,
245
+ 4,
246
+ 5,
247
+ 6,
248
+ 7,
249
+ 8,
250
+ 9,
251
+ 10,
252
+ 11,
253
+ 12,
254
+ 13,
255
+ 14,
256
+ 15
257
+ ],
258
+ "modality_keys": [
259
+ "left_arm",
260
+ "right_arm",
261
+ "left_hand",
262
+ "right_hand",
263
+ "waist"
264
+ ],
265
+ "sin_cos_embedding_keys": null,
266
+ "mean_std_embedding_keys": null,
267
+ "min_max_embedding_keys": null,
268
+ "pass_through_keys": null,
269
+ "action_configs": [
270
+ {
271
+ "rep": "RELATIVE",
272
+ "type": "NON_EEF",
273
+ "format": "DEFAULT",
274
+ "state_key": null,
275
+ "input_rotation_format": "quat",
276
+ "input_quat_order": "xyzw",
277
+ "reference_rotation_format": "rot6d",
278
+ "reference_quat_order": "xyzw",
279
+ "translation_scaling_key": null,
280
+ "rotation_scaling_key": null,
281
+ "hold_through_clutch": false,
282
+ "normalization_type": "percentile"
283
+ },
284
+ {
285
+ "rep": "RELATIVE",
286
+ "type": "NON_EEF",
287
+ "format": "DEFAULT",
288
+ "state_key": null,
289
+ "input_rotation_format": "quat",
290
+ "input_quat_order": "xyzw",
291
+ "reference_rotation_format": "rot6d",
292
+ "reference_quat_order": "xyzw",
293
+ "translation_scaling_key": null,
294
+ "rotation_scaling_key": null,
295
+ "hold_through_clutch": false,
296
+ "normalization_type": "percentile"
297
+ },
298
+ {
299
+ "rep": "RELATIVE",
300
+ "type": "NON_EEF",
301
+ "format": "DEFAULT",
302
+ "state_key": null,
303
+ "input_rotation_format": "quat",
304
+ "input_quat_order": "xyzw",
305
+ "reference_rotation_format": "rot6d",
306
+ "reference_quat_order": "xyzw",
307
+ "translation_scaling_key": null,
308
+ "rotation_scaling_key": null,
309
+ "hold_through_clutch": false,
310
+ "normalization_type": "percentile"
311
+ },
312
+ {
313
+ "rep": "RELATIVE",
314
+ "type": "NON_EEF",
315
+ "format": "DEFAULT",
316
+ "state_key": null,
317
+ "input_rotation_format": "quat",
318
+ "input_quat_order": "xyzw",
319
+ "reference_rotation_format": "rot6d",
320
+ "reference_quat_order": "xyzw",
321
+ "translation_scaling_key": null,
322
+ "rotation_scaling_key": null,
323
+ "hold_through_clutch": false,
324
+ "normalization_type": "percentile"
325
+ },
326
+ {
327
+ "rep": "ABSOLUTE",
328
+ "type": "NON_EEF",
329
+ "format": "DEFAULT",
330
+ "state_key": null,
331
+ "input_rotation_format": "quat",
332
+ "input_quat_order": "xyzw",
333
+ "reference_rotation_format": "rot6d",
334
+ "reference_quat_order": "xyzw",
335
+ "translation_scaling_key": null,
336
+ "rotation_scaling_key": null,
337
+ "hold_through_clutch": false,
338
+ "normalization_type": "percentile"
339
+ }
340
+ ]
341
+ },
342
+ "language": {
343
+ "delta_indices": [
344
+ 0
345
+ ],
346
+ "modality_keys": [
347
+ "task"
348
+ ],
349
+ "sin_cos_embedding_keys": null,
350
+ "mean_std_embedding_keys": null,
351
+ "min_max_embedding_keys": null,
352
+ "pass_through_keys": null,
353
+ "action_configs": null
354
+ }
355
+ },
356
+ "robocasa_panda_omron": {
357
+ "video": {
358
+ "delta_indices": [
359
+ 0
360
+ ],
361
+ "modality_keys": [
362
+ "res256_image_side_0",
363
+ "res256_image_side_1",
364
+ "res256_image_wrist_0"
365
+ ],
366
+ "sin_cos_embedding_keys": null,
367
+ "mean_std_embedding_keys": null,
368
+ "min_max_embedding_keys": null,
369
+ "pass_through_keys": null,
370
+ "action_configs": null
371
+ },
372
+ "state": {
373
+ "delta_indices": [
374
+ 0
375
+ ],
376
+ "modality_keys": [
377
+ "end_effector_position_relative",
378
+ "end_effector_rotation_relative",
379
+ "gripper_qpos",
380
+ "base_position",
381
+ "base_rotation"
382
+ ],
383
+ "sin_cos_embedding_keys": null,
384
+ "mean_std_embedding_keys": null,
385
+ "min_max_embedding_keys": null,
386
+ "pass_through_keys": null,
387
+ "action_configs": null
388
+ },
389
+ "action": {
390
+ "delta_indices": [
391
+ 0,
392
+ 1,
393
+ 2,
394
+ 3,
395
+ 4,
396
+ 5,
397
+ 6,
398
+ 7,
399
+ 8,
400
+ 9,
401
+ 10,
402
+ 11,
403
+ 12,
404
+ 13,
405
+ 14,
406
+ 15
407
+ ],
408
+ "modality_keys": [
409
+ "end_effector_position",
410
+ "end_effector_rotation",
411
+ "gripper_close",
412
+ "base_motion",
413
+ "control_mode"
414
+ ],
415
+ "sin_cos_embedding_keys": null,
416
+ "mean_std_embedding_keys": null,
417
+ "min_max_embedding_keys": null,
418
+ "pass_through_keys": null,
419
+ "action_configs": [
420
+ {
421
+ "rep": "ABSOLUTE",
422
+ "type": "NON_EEF",
423
+ "format": "DEFAULT",
424
+ "state_key": null,
425
+ "input_rotation_format": "quat",
426
+ "input_quat_order": "xyzw",
427
+ "reference_rotation_format": "rot6d",
428
+ "reference_quat_order": "xyzw",
429
+ "translation_scaling_key": null,
430
+ "rotation_scaling_key": null,
431
+ "hold_through_clutch": false,
432
+ "normalization_type": "percentile"
433
+ },
434
+ {
435
+ "rep": "ABSOLUTE",
436
+ "type": "NON_EEF",
437
+ "format": "DEFAULT",
438
+ "state_key": null,
439
+ "input_rotation_format": "quat",
440
+ "input_quat_order": "xyzw",
441
+ "reference_rotation_format": "rot6d",
442
+ "reference_quat_order": "xyzw",
443
+ "translation_scaling_key": null,
444
+ "rotation_scaling_key": null,
445
+ "hold_through_clutch": false,
446
+ "normalization_type": "percentile"
447
+ },
448
+ {
449
+ "rep": "ABSOLUTE",
450
+ "type": "NON_EEF",
451
+ "format": "DEFAULT",
452
+ "state_key": null,
453
+ "input_rotation_format": "quat",
454
+ "input_quat_order": "xyzw",
455
+ "reference_rotation_format": "rot6d",
456
+ "reference_quat_order": "xyzw",
457
+ "translation_scaling_key": null,
458
+ "rotation_scaling_key": null,
459
+ "hold_through_clutch": false,
460
+ "normalization_type": "percentile"
461
+ },
462
+ {
463
+ "rep": "ABSOLUTE",
464
+ "type": "NON_EEF",
465
+ "format": "DEFAULT",
466
+ "state_key": null,
467
+ "input_rotation_format": "quat",
468
+ "input_quat_order": "xyzw",
469
+ "reference_rotation_format": "rot6d",
470
+ "reference_quat_order": "xyzw",
471
+ "translation_scaling_key": null,
472
+ "rotation_scaling_key": null,
473
+ "hold_through_clutch": false,
474
+ "normalization_type": "percentile"
475
+ },
476
+ {
477
+ "rep": "ABSOLUTE",
478
+ "type": "NON_EEF",
479
+ "format": "DEFAULT",
480
+ "state_key": null,
481
+ "input_rotation_format": "quat",
482
+ "input_quat_order": "xyzw",
483
+ "reference_rotation_format": "rot6d",
484
+ "reference_quat_order": "xyzw",
485
+ "translation_scaling_key": null,
486
+ "rotation_scaling_key": null,
487
+ "hold_through_clutch": false,
488
+ "normalization_type": "percentile"
489
+ }
490
+ ]
491
+ },
492
+ "language": {
493
+ "delta_indices": [
494
+ 0
495
+ ],
496
+ "modality_keys": [
497
+ "annotation.human.action.task_description"
498
+ ],
499
+ "sin_cos_embedding_keys": null,
500
+ "mean_std_embedding_keys": null,
501
+ "min_max_embedding_keys": null,
502
+ "pass_through_keys": null,
503
+ "action_configs": null
504
+ }
505
+ },
506
+ "cmr_versius": {
507
+ "video": {
508
+ "delta_indices": [
509
+ 0
510
+ ],
511
+ "modality_keys": [
512
+ "endoscope"
513
+ ],
514
+ "sin_cos_embedding_keys": null,
515
+ "mean_std_embedding_keys": null,
516
+ "min_max_embedding_keys": null,
517
+ "pass_through_keys": null,
518
+ "action_configs": null
519
+ },
520
+ "state": {
521
+ "delta_indices": [
522
+ 0
523
+ ],
524
+ "modality_keys": [
525
+ "left_pose",
526
+ "left_gripper",
527
+ "right_pose",
528
+ "right_gripper",
529
+ "translation_scaling",
530
+ "rotation_scaling",
531
+ "hapticengaged_left",
532
+ "hapticengaged_right"
533
+ ],
534
+ "sin_cos_embedding_keys": null,
535
+ "mean_std_embedding_keys": [
536
+ "left_pose",
537
+ "left_gripper",
538
+ "right_pose",
539
+ "right_gripper"
540
+ ],
541
+ "min_max_embedding_keys": null,
542
+ "pass_through_keys": [
543
+ "translation_scaling",
544
+ "rotation_scaling",
545
+ "hapticengaged_left",
546
+ "hapticengaged_right"
547
+ ],
548
+ "action_configs": null
549
+ },
550
+ "action": {
551
+ "delta_indices": [
552
+ 2,
553
+ 4,
554
+ 6,
555
+ 8,
556
+ 10,
557
+ 12,
558
+ 14,
559
+ 16,
560
+ 18,
561
+ 20,
562
+ 22,
563
+ 24,
564
+ 26,
565
+ 28,
566
+ 30,
567
+ 32,
568
+ 34,
569
+ 36,
570
+ 38,
571
+ 40,
572
+ 42,
573
+ 44,
574
+ 46,
575
+ 48,
576
+ 50,
577
+ 52,
578
+ 54,
579
+ 56,
580
+ 58,
581
+ 60,
582
+ 62,
583
+ 64,
584
+ 66,
585
+ 68,
586
+ 70,
587
+ 72,
588
+ 74,
589
+ 76,
590
+ 78,
591
+ 80,
592
+ 82,
593
+ 84,
594
+ 86,
595
+ 88,
596
+ 90,
597
+ 92,
598
+ 94,
599
+ 96,
600
+ 98,
601
+ 100
602
+ ],
603
+ "modality_keys": [
604
+ "left_pose",
605
+ "left_gripper",
606
+ "right_pose",
607
+ "right_gripper",
608
+ "hapticengaged_left",
609
+ "hapticengaged_right"
610
+ ],
611
+ "sin_cos_embedding_keys": null,
612
+ "mean_std_embedding_keys": null,
613
+ "min_max_embedding_keys": null,
614
+ "pass_through_keys": [
615
+ "hapticengaged_left",
616
+ "hapticengaged_right"
617
+ ],
618
+ "action_configs": [
619
+ {
620
+ "rep": "REL_XYZ_ROT6D",
621
+ "type": "EEF",
622
+ "format": "XYZ_ROT6D",
623
+ "state_key": "left_pose",
624
+ "input_rotation_format": "quat",
625
+ "input_quat_order": "xyzw",
626
+ "reference_rotation_format": "quat",
627
+ "reference_quat_order": "xyzw",
628
+ "translation_scaling_key": "translation_scaling",
629
+ "rotation_scaling_key": "rotation_scaling",
630
+ "hold_through_clutch": false,
631
+ "normalization_type": "percentile"
632
+ },
633
+ {
634
+ "rep": "ABSOLUTE",
635
+ "type": "NON_EEF",
636
+ "format": "DEFAULT",
637
+ "state_key": "left_gripper",
638
+ "input_rotation_format": "quat",
639
+ "input_quat_order": "xyzw",
640
+ "reference_rotation_format": "rot6d",
641
+ "reference_quat_order": "xyzw",
642
+ "translation_scaling_key": null,
643
+ "rotation_scaling_key": null,
644
+ "hold_through_clutch": true,
645
+ "normalization_type": "percentile"
646
+ },
647
+ {
648
+ "rep": "REL_XYZ_ROT6D",
649
+ "type": "EEF",
650
+ "format": "XYZ_ROT6D",
651
+ "state_key": "right_pose",
652
+ "input_rotation_format": "quat",
653
+ "input_quat_order": "xyzw",
654
+ "reference_rotation_format": "quat",
655
+ "reference_quat_order": "xyzw",
656
+ "translation_scaling_key": "translation_scaling",
657
+ "rotation_scaling_key": "rotation_scaling",
658
+ "hold_through_clutch": false,
659
+ "normalization_type": "percentile"
660
+ },
661
+ {
662
+ "rep": "ABSOLUTE",
663
+ "type": "NON_EEF",
664
+ "format": "DEFAULT",
665
+ "state_key": "right_gripper",
666
+ "input_rotation_format": "quat",
667
+ "input_quat_order": "xyzw",
668
+ "reference_rotation_format": "rot6d",
669
+ "reference_quat_order": "xyzw",
670
+ "translation_scaling_key": null,
671
+ "rotation_scaling_key": null,
672
+ "hold_through_clutch": true,
673
+ "normalization_type": "percentile"
674
+ },
675
+ {
676
+ "rep": "ABSOLUTE",
677
+ "type": "NON_EEF",
678
+ "format": "DEFAULT",
679
+ "state_key": null,
680
+ "input_rotation_format": "quat",
681
+ "input_quat_order": "xyzw",
682
+ "reference_rotation_format": "rot6d",
683
+ "reference_quat_order": "xyzw",
684
+ "translation_scaling_key": null,
685
+ "rotation_scaling_key": null,
686
+ "hold_through_clutch": false,
687
+ "normalization_type": "skip"
688
+ },
689
+ {
690
+ "rep": "ABSOLUTE",
691
+ "type": "NON_EEF",
692
+ "format": "DEFAULT",
693
+ "state_key": null,
694
+ "input_rotation_format": "quat",
695
+ "input_quat_order": "xyzw",
696
+ "reference_rotation_format": "rot6d",
697
+ "reference_quat_order": "xyzw",
698
+ "translation_scaling_key": null,
699
+ "rotation_scaling_key": null,
700
+ "hold_through_clutch": false,
701
+ "normalization_type": "skip"
702
+ }
703
+ ]
704
+ },
705
+ "language": {
706
+ "delta_indices": [
707
+ 0
708
+ ],
709
+ "modality_keys": [
710
+ "annotation.human.task_description"
711
+ ],
712
+ "sin_cos_embedding_keys": null,
713
+ "mean_std_embedding_keys": null,
714
+ "min_max_embedding_keys": null,
715
+ "pass_through_keys": null,
716
+ "action_configs": null
717
+ }
718
+ },
719
+ "ucsd_dvrk": {
720
+ "video": {
721
+ "delta_indices": [
722
+ 0
723
+ ],
724
+ "modality_keys": [
725
+ "camera_left"
726
+ ],
727
+ "sin_cos_embedding_keys": null,
728
+ "mean_std_embedding_keys": null,
729
+ "min_max_embedding_keys": null,
730
+ "pass_through_keys": null,
731
+ "action_configs": null
732
+ },
733
+ "state": {
734
+ "delta_indices": [
735
+ 0
736
+ ],
737
+ "modality_keys": [
738
+ "psm_retraction_pose",
739
+ "psm_retraction_gripper",
740
+ "psm_cutter_pose",
741
+ "psm_cutter_gripper"
742
+ ],
743
+ "sin_cos_embedding_keys": null,
744
+ "mean_std_embedding_keys": [
745
+ "psm_retraction_pose",
746
+ "psm_retraction_gripper",
747
+ "psm_cutter_pose",
748
+ "psm_cutter_gripper"
749
+ ],
750
+ "min_max_embedding_keys": null,
751
+ "pass_through_keys": null,
752
+ "action_configs": null
753
+ },
754
+ "action": {
755
+ "delta_indices": [
756
+ 1,
757
+ 2,
758
+ 3,
759
+ 4,
760
+ 5,
761
+ 6,
762
+ 7,
763
+ 8,
764
+ 9,
765
+ 10,
766
+ 11,
767
+ 12,
768
+ 13,
769
+ 14,
770
+ 15,
771
+ 16,
772
+ 17,
773
+ 18,
774
+ 19,
775
+ 20,
776
+ 21,
777
+ 22,
778
+ 23,
779
+ 24,
780
+ 25,
781
+ 26,
782
+ 27,
783
+ 28,
784
+ 29,
785
+ 30,
786
+ 31,
787
+ 32,
788
+ 33,
789
+ 34,
790
+ 35,
791
+ 36,
792
+ 37,
793
+ 38,
794
+ 39,
795
+ 40,
796
+ 41,
797
+ 42,
798
+ 43,
799
+ 44,
800
+ 45,
801
+ 46,
802
+ 47,
803
+ 48,
804
+ 49,
805
+ 50
806
+ ],
807
+ "modality_keys": [
808
+ "psm_retraction_pose",
809
+ "psm_retraction_gripper",
810
+ "psm_cutter_pose",
811
+ "psm_cutter_gripper"
812
+ ],
813
+ "sin_cos_embedding_keys": null,
814
+ "mean_std_embedding_keys": null,
815
+ "min_max_embedding_keys": null,
816
+ "pass_through_keys": null,
817
+ "action_configs": [
818
+ {
819
+ "rep": "REL_XYZ_ROT6D",
820
+ "type": "EEF",
821
+ "format": "XYZ_ROT6D",
822
+ "state_key": "psm_retraction_pose",
823
+ "input_rotation_format": "quat",
824
+ "input_quat_order": "wxyz",
825
+ "reference_rotation_format": "quat",
826
+ "reference_quat_order": "wxyz",
827
+ "translation_scaling_key": null,
828
+ "rotation_scaling_key": null,
829
+ "hold_through_clutch": false,
830
+ "normalization_type": "percentile"
831
+ },
832
+ {
833
+ "rep": "ABSOLUTE",
834
+ "type": "NON_EEF",
835
+ "format": "DEFAULT",
836
+ "state_key": null,
837
+ "input_rotation_format": "quat",
838
+ "input_quat_order": "xyzw",
839
+ "reference_rotation_format": "rot6d",
840
+ "reference_quat_order": "xyzw",
841
+ "translation_scaling_key": null,
842
+ "rotation_scaling_key": null,
843
+ "hold_through_clutch": false,
844
+ "normalization_type": "percentile"
845
+ },
846
+ {
847
+ "rep": "REL_XYZ_ROT6D",
848
+ "type": "EEF",
849
+ "format": "XYZ_ROT6D",
850
+ "state_key": "psm_cutter_pose",
851
+ "input_rotation_format": "quat",
852
+ "input_quat_order": "wxyz",
853
+ "reference_rotation_format": "quat",
854
+ "reference_quat_order": "wxyz",
855
+ "translation_scaling_key": null,
856
+ "rotation_scaling_key": null,
857
+ "hold_through_clutch": false,
858
+ "normalization_type": "percentile"
859
+ },
860
+ {
861
+ "rep": "ABSOLUTE",
862
+ "type": "NON_EEF",
863
+ "format": "DEFAULT",
864
+ "state_key": null,
865
+ "input_rotation_format": "quat",
866
+ "input_quat_order": "xyzw",
867
+ "reference_rotation_format": "rot6d",
868
+ "reference_quat_order": "xyzw",
869
+ "translation_scaling_key": null,
870
+ "rotation_scaling_key": null,
871
+ "hold_through_clutch": false,
872
+ "normalization_type": "percentile"
873
+ }
874
+ ]
875
+ },
876
+ "language": {
877
+ "delta_indices": [
878
+ 0
879
+ ],
880
+ "modality_keys": [
881
+ "task"
882
+ ],
883
+ "sin_cos_embedding_keys": null,
884
+ "mean_std_embedding_keys": null,
885
+ "min_max_embedding_keys": null,
886
+ "pass_through_keys": null,
887
+ "action_configs": null
888
+ }
889
+ },
890
+ "jhu_imerse_dvrk": {
891
+ "video": {
892
+ "delta_indices": [
893
+ 0
894
+ ],
895
+ "modality_keys": [
896
+ "endoscope_left",
897
+ "wrist_left",
898
+ "wrist_right"
899
+ ],
900
+ "sin_cos_embedding_keys": null,
901
+ "mean_std_embedding_keys": null,
902
+ "min_max_embedding_keys": null,
903
+ "pass_through_keys": null,
904
+ "action_configs": null
905
+ },
906
+ "state": {
907
+ "delta_indices": [
908
+ 0
909
+ ],
910
+ "modality_keys": [
911
+ "psm1_pose",
912
+ "psm1_gripper",
913
+ "psm2_pose",
914
+ "psm2_gripper"
915
+ ],
916
+ "sin_cos_embedding_keys": null,
917
+ "mean_std_embedding_keys": [
918
+ "psm1_pose",
919
+ "psm1_gripper",
920
+ "psm2_pose",
921
+ "psm2_gripper"
922
+ ],
923
+ "min_max_embedding_keys": null,
924
+ "pass_through_keys": null,
925
+ "action_configs": null
926
+ },
927
+ "action": {
928
+ "delta_indices": [
929
+ 1,
930
+ 2,
931
+ 3,
932
+ 4,
933
+ 5,
934
+ 6,
935
+ 7,
936
+ 8,
937
+ 9,
938
+ 10,
939
+ 11,
940
+ 12,
941
+ 13,
942
+ 14,
943
+ 15,
944
+ 16,
945
+ 17,
946
+ 18,
947
+ 19,
948
+ 20,
949
+ 21,
950
+ 22,
951
+ 23,
952
+ 24,
953
+ 25,
954
+ 26,
955
+ 27,
956
+ 28,
957
+ 29,
958
+ 30,
959
+ 31,
960
+ 32,
961
+ 33,
962
+ 34,
963
+ 35,
964
+ 36,
965
+ 37,
966
+ 38,
967
+ 39,
968
+ 40,
969
+ 41,
970
+ 42,
971
+ 43,
972
+ 44,
973
+ 45,
974
+ 46,
975
+ 47,
976
+ 48,
977
+ 49,
978
+ 50
979
+ ],
980
+ "modality_keys": [
981
+ "psm1_pose",
982
+ "psm1_gripper",
983
+ "psm2_pose",
984
+ "psm2_gripper"
985
+ ],
986
+ "sin_cos_embedding_keys": null,
987
+ "mean_std_embedding_keys": null,
988
+ "min_max_embedding_keys": null,
989
+ "pass_through_keys": null,
990
+ "action_configs": [
991
+ {
992
+ "rep": "REL_XYZ_ROT6D",
993
+ "type": "EEF",
994
+ "format": "XYZ_ROT6D",
995
+ "state_key": "psm1_pose",
996
+ "input_rotation_format": "quat",
997
+ "input_quat_order": "xyzw",
998
+ "reference_rotation_format": "quat",
999
+ "reference_quat_order": "xyzw",
1000
+ "translation_scaling_key": null,
1001
+ "rotation_scaling_key": null,
1002
+ "hold_through_clutch": false,
1003
+ "normalization_type": "percentile"
1004
+ },
1005
+ {
1006
+ "rep": "ABSOLUTE",
1007
+ "type": "NON_EEF",
1008
+ "format": "DEFAULT",
1009
+ "state_key": null,
1010
+ "input_rotation_format": "quat",
1011
+ "input_quat_order": "xyzw",
1012
+ "reference_rotation_format": "rot6d",
1013
+ "reference_quat_order": "xyzw",
1014
+ "translation_scaling_key": null,
1015
+ "rotation_scaling_key": null,
1016
+ "hold_through_clutch": false,
1017
+ "normalization_type": "percentile"
1018
+ },
1019
+ {
1020
+ "rep": "REL_XYZ_ROT6D",
1021
+ "type": "EEF",
1022
+ "format": "XYZ_ROT6D",
1023
+ "state_key": "psm2_pose",
1024
+ "input_rotation_format": "quat",
1025
+ "input_quat_order": "xyzw",
1026
+ "reference_rotation_format": "quat",
1027
+ "reference_quat_order": "xyzw",
1028
+ "translation_scaling_key": null,
1029
+ "rotation_scaling_key": null,
1030
+ "hold_through_clutch": false,
1031
+ "normalization_type": "percentile"
1032
+ },
1033
+ {
1034
+ "rep": "ABSOLUTE",
1035
+ "type": "NON_EEF",
1036
+ "format": "DEFAULT",
1037
+ "state_key": null,
1038
+ "input_rotation_format": "quat",
1039
+ "input_quat_order": "xyzw",
1040
+ "reference_rotation_format": "rot6d",
1041
+ "reference_quat_order": "xyzw",
1042
+ "translation_scaling_key": null,
1043
+ "rotation_scaling_key": null,
1044
+ "hold_through_clutch": false,
1045
+ "normalization_type": "percentile"
1046
+ }
1047
+ ]
1048
+ },
1049
+ "language": {
1050
+ "delta_indices": [
1051
+ 0
1052
+ ],
1053
+ "modality_keys": [
1054
+ "annotation.human.task_description"
1055
+ ],
1056
+ "sin_cos_embedding_keys": null,
1057
+ "mean_std_embedding_keys": null,
1058
+ "min_max_embedding_keys": null,
1059
+ "pass_through_keys": null,
1060
+ "action_configs": null
1061
+ }
1062
+ },
1063
+ "obuda_dvrk": {
1064
+ "video": {
1065
+ "delta_indices": [
1066
+ 0
1067
+ ],
1068
+ "modality_keys": [
1069
+ "endoscope_left",
1070
+ "wrist_left",
1071
+ "wrist_right"
1072
+ ],
1073
+ "sin_cos_embedding_keys": null,
1074
+ "mean_std_embedding_keys": null,
1075
+ "min_max_embedding_keys": null,
1076
+ "pass_through_keys": null,
1077
+ "action_configs": null
1078
+ },
1079
+ "state": {
1080
+ "delta_indices": [
1081
+ 0
1082
+ ],
1083
+ "modality_keys": [
1084
+ "psm1_pose",
1085
+ "psm1_gripper",
1086
+ "psm2_pose",
1087
+ "psm2_gripper"
1088
+ ],
1089
+ "sin_cos_embedding_keys": null,
1090
+ "mean_std_embedding_keys": [
1091
+ "psm1_pose",
1092
+ "psm1_gripper",
1093
+ "psm2_pose",
1094
+ "psm2_gripper"
1095
+ ],
1096
+ "min_max_embedding_keys": null,
1097
+ "pass_through_keys": null,
1098
+ "action_configs": null
1099
+ },
1100
+ "action": {
1101
+ "delta_indices": [
1102
+ 0,
1103
+ 1,
1104
+ 2,
1105
+ 3,
1106
+ 4,
1107
+ 5,
1108
+ 6,
1109
+ 7,
1110
+ 8,
1111
+ 9,
1112
+ 10,
1113
+ 11,
1114
+ 12,
1115
+ 13,
1116
+ 14,
1117
+ 15,
1118
+ 16,
1119
+ 17,
1120
+ 18,
1121
+ 19,
1122
+ 20,
1123
+ 21,
1124
+ 22,
1125
+ 23,
1126
+ 24,
1127
+ 25,
1128
+ 26,
1129
+ 27,
1130
+ 28,
1131
+ 29,
1132
+ 30,
1133
+ 31,
1134
+ 32,
1135
+ 33,
1136
+ 34,
1137
+ 35,
1138
+ 36,
1139
+ 37,
1140
+ 38,
1141
+ 39,
1142
+ 40,
1143
+ 41,
1144
+ 42,
1145
+ 43,
1146
+ 44,
1147
+ 45,
1148
+ 46,
1149
+ 47,
1150
+ 48,
1151
+ 49
1152
+ ],
1153
+ "modality_keys": [
1154
+ "psm1_pose",
1155
+ "psm1_gripper",
1156
+ "psm2_pose",
1157
+ "psm2_gripper"
1158
+ ],
1159
+ "sin_cos_embedding_keys": null,
1160
+ "mean_std_embedding_keys": null,
1161
+ "min_max_embedding_keys": null,
1162
+ "pass_through_keys": null,
1163
+ "action_configs": [
1164
+ {
1165
+ "rep": "REL_XYZ_ROT6D",
1166
+ "type": "EEF",
1167
+ "format": "XYZ_ROT6D",
1168
+ "state_key": "psm1_pose",
1169
+ "input_rotation_format": "quat",
1170
+ "input_quat_order": "xyzw",
1171
+ "reference_rotation_format": "quat",
1172
+ "reference_quat_order": "xyzw",
1173
+ "translation_scaling_key": null,
1174
+ "rotation_scaling_key": null,
1175
+ "hold_through_clutch": false,
1176
+ "normalization_type": "percentile"
1177
+ },
1178
+ {
1179
+ "rep": "ABSOLUTE",
1180
+ "type": "NON_EEF",
1181
+ "format": "DEFAULT",
1182
+ "state_key": null,
1183
+ "input_rotation_format": "quat",
1184
+ "input_quat_order": "xyzw",
1185
+ "reference_rotation_format": "rot6d",
1186
+ "reference_quat_order": "xyzw",
1187
+ "translation_scaling_key": null,
1188
+ "rotation_scaling_key": null,
1189
+ "hold_through_clutch": false,
1190
+ "normalization_type": "percentile"
1191
+ },
1192
+ {
1193
+ "rep": "REL_XYZ_ROT6D",
1194
+ "type": "EEF",
1195
+ "format": "XYZ_ROT6D",
1196
+ "state_key": "psm2_pose",
1197
+ "input_rotation_format": "quat",
1198
+ "input_quat_order": "xyzw",
1199
+ "reference_rotation_format": "quat",
1200
+ "reference_quat_order": "xyzw",
1201
+ "translation_scaling_key": null,
1202
+ "rotation_scaling_key": null,
1203
+ "hold_through_clutch": false,
1204
+ "normalization_type": "percentile"
1205
+ },
1206
+ {
1207
+ "rep": "ABSOLUTE",
1208
+ "type": "NON_EEF",
1209
+ "format": "DEFAULT",
1210
+ "state_key": null,
1211
+ "input_rotation_format": "quat",
1212
+ "input_quat_order": "xyzw",
1213
+ "reference_rotation_format": "rot6d",
1214
+ "reference_quat_order": "xyzw",
1215
+ "translation_scaling_key": null,
1216
+ "rotation_scaling_key": null,
1217
+ "hold_through_clutch": false,
1218
+ "normalization_type": "percentile"
1219
+ }
1220
+ ]
1221
+ },
1222
+ "language": {
1223
+ "delta_indices": [
1224
+ 0
1225
+ ],
1226
+ "modality_keys": [
1227
+ "task"
1228
+ ],
1229
+ "sin_cos_embedding_keys": null,
1230
+ "mean_std_embedding_keys": null,
1231
+ "min_max_embedding_keys": null,
1232
+ "pass_through_keys": null,
1233
+ "action_configs": null
1234
+ }
1235
+ },
1236
+ "stanford_dvrk_real": {
1237
+ "video": {
1238
+ "delta_indices": [
1239
+ 0
1240
+ ],
1241
+ "modality_keys": [
1242
+ "endoscope_left"
1243
+ ],
1244
+ "sin_cos_embedding_keys": null,
1245
+ "mean_std_embedding_keys": null,
1246
+ "min_max_embedding_keys": null,
1247
+ "pass_through_keys": null,
1248
+ "action_configs": null
1249
+ },
1250
+ "state": {
1251
+ "delta_indices": [
1252
+ 0
1253
+ ],
1254
+ "modality_keys": [
1255
+ "psm1_pose",
1256
+ "psm1_gripper",
1257
+ "psm2_pose",
1258
+ "psm2_gripper"
1259
+ ],
1260
+ "sin_cos_embedding_keys": null,
1261
+ "mean_std_embedding_keys": [
1262
+ "psm1_pose",
1263
+ "psm1_gripper",
1264
+ "psm2_pose",
1265
+ "psm2_gripper"
1266
+ ],
1267
+ "min_max_embedding_keys": null,
1268
+ "pass_through_keys": null,
1269
+ "action_configs": null
1270
+ },
1271
+ "action": {
1272
+ "delta_indices": [
1273
+ 0,
1274
+ 1,
1275
+ 2,
1276
+ 3,
1277
+ 4,
1278
+ 5,
1279
+ 6,
1280
+ 7,
1281
+ 8,
1282
+ 9,
1283
+ 10,
1284
+ 11,
1285
+ 12,
1286
+ 13,
1287
+ 14,
1288
+ 15,
1289
+ 16,
1290
+ 17,
1291
+ 18,
1292
+ 19,
1293
+ 20,
1294
+ 21,
1295
+ 22,
1296
+ 23,
1297
+ 24,
1298
+ 25,
1299
+ 26,
1300
+ 27,
1301
+ 28,
1302
+ 29,
1303
+ 30,
1304
+ 31,
1305
+ 32,
1306
+ 33,
1307
+ 34,
1308
+ 35,
1309
+ 36,
1310
+ 37,
1311
+ 38,
1312
+ 39,
1313
+ 40,
1314
+ 41,
1315
+ 42,
1316
+ 43,
1317
+ 44,
1318
+ 45,
1319
+ 46,
1320
+ 47,
1321
+ 48,
1322
+ 49
1323
+ ],
1324
+ "modality_keys": [
1325
+ "psm1_pose",
1326
+ "psm1_gripper",
1327
+ "psm2_pose",
1328
+ "psm2_gripper"
1329
+ ],
1330
+ "sin_cos_embedding_keys": null,
1331
+ "mean_std_embedding_keys": null,
1332
+ "min_max_embedding_keys": null,
1333
+ "pass_through_keys": null,
1334
+ "action_configs": [
1335
+ {
1336
+ "rep": "REL_XYZ_ROT6D",
1337
+ "type": "EEF",
1338
+ "format": "XYZ_ROT6D",
1339
+ "state_key": "psm1_pose",
1340
+ "input_rotation_format": "euler",
1341
+ "input_quat_order": "xyzw",
1342
+ "reference_rotation_format": "euler",
1343
+ "reference_quat_order": "xyzw",
1344
+ "translation_scaling_key": null,
1345
+ "rotation_scaling_key": null,
1346
+ "hold_through_clutch": false,
1347
+ "normalization_type": "percentile"
1348
+ },
1349
+ {
1350
+ "rep": "ABSOLUTE",
1351
+ "type": "NON_EEF",
1352
+ "format": "DEFAULT",
1353
+ "state_key": null,
1354
+ "input_rotation_format": "quat",
1355
+ "input_quat_order": "xyzw",
1356
+ "reference_rotation_format": "rot6d",
1357
+ "reference_quat_order": "xyzw",
1358
+ "translation_scaling_key": null,
1359
+ "rotation_scaling_key": null,
1360
+ "hold_through_clutch": false,
1361
+ "normalization_type": "percentile"
1362
+ },
1363
+ {
1364
+ "rep": "REL_XYZ_ROT6D",
1365
+ "type": "EEF",
1366
+ "format": "XYZ_ROT6D",
1367
+ "state_key": "psm2_pose",
1368
+ "input_rotation_format": "euler",
1369
+ "input_quat_order": "xyzw",
1370
+ "reference_rotation_format": "euler",
1371
+ "reference_quat_order": "xyzw",
1372
+ "translation_scaling_key": null,
1373
+ "rotation_scaling_key": null,
1374
+ "hold_through_clutch": false,
1375
+ "normalization_type": "percentile"
1376
+ },
1377
+ {
1378
+ "rep": "ABSOLUTE",
1379
+ "type": "NON_EEF",
1380
+ "format": "DEFAULT",
1381
+ "state_key": null,
1382
+ "input_rotation_format": "quat",
1383
+ "input_quat_order": "xyzw",
1384
+ "reference_rotation_format": "rot6d",
1385
+ "reference_quat_order": "xyzw",
1386
+ "translation_scaling_key": null,
1387
+ "rotation_scaling_key": null,
1388
+ "hold_through_clutch": false,
1389
+ "normalization_type": "percentile"
1390
+ }
1391
+ ]
1392
+ },
1393
+ "language": {
1394
+ "delta_indices": [
1395
+ 0
1396
+ ],
1397
+ "modality_keys": [
1398
+ "task"
1399
+ ],
1400
+ "sin_cos_embedding_keys": null,
1401
+ "mean_std_embedding_keys": null,
1402
+ "min_max_embedding_keys": null,
1403
+ "pass_through_keys": null,
1404
+ "action_configs": null
1405
+ }
1406
+ },
1407
+ "tud_tundra_ur5e": {
1408
+ "video": {
1409
+ "delta_indices": [
1410
+ 0
1411
+ ],
1412
+ "modality_keys": [
1413
+ "laparoscope_left"
1414
+ ],
1415
+ "sin_cos_embedding_keys": null,
1416
+ "mean_std_embedding_keys": null,
1417
+ "min_max_embedding_keys": null,
1418
+ "pass_through_keys": null,
1419
+ "action_configs": null
1420
+ },
1421
+ "state": {
1422
+ "delta_indices": [
1423
+ 0
1424
+ ],
1425
+ "modality_keys": [
1426
+ "joint_position",
1427
+ "eef_pose"
1428
+ ],
1429
+ "sin_cos_embedding_keys": null,
1430
+ "mean_std_embedding_keys": [
1431
+ "joint_position"
1432
+ ],
1433
+ "min_max_embedding_keys": null,
1434
+ "pass_through_keys": [
1435
+ "eef_pose"
1436
+ ],
1437
+ "action_configs": null
1438
+ },
1439
+ "action": {
1440
+ "delta_indices": [
1441
+ 1,
1442
+ 2,
1443
+ 3,
1444
+ 4,
1445
+ 5,
1446
+ 6,
1447
+ 7,
1448
+ 8,
1449
+ 9,
1450
+ 10,
1451
+ 11,
1452
+ 12,
1453
+ 13,
1454
+ 14,
1455
+ 15,
1456
+ 16,
1457
+ 17,
1458
+ 18,
1459
+ 19,
1460
+ 20,
1461
+ 21,
1462
+ 22,
1463
+ 23,
1464
+ 24,
1465
+ 25,
1466
+ 26,
1467
+ 27,
1468
+ 28,
1469
+ 29,
1470
+ 30,
1471
+ 31,
1472
+ 32,
1473
+ 33,
1474
+ 34,
1475
+ 35,
1476
+ 36,
1477
+ 37,
1478
+ 38,
1479
+ 39,
1480
+ 40,
1481
+ 41,
1482
+ 42,
1483
+ 43,
1484
+ 44,
1485
+ 45,
1486
+ 46,
1487
+ 47,
1488
+ 48,
1489
+ 49,
1490
+ 50
1491
+ ],
1492
+ "modality_keys": [
1493
+ "eef_pose",
1494
+ "gripper"
1495
+ ],
1496
+ "sin_cos_embedding_keys": null,
1497
+ "mean_std_embedding_keys": null,
1498
+ "min_max_embedding_keys": null,
1499
+ "pass_through_keys": null,
1500
+ "action_configs": [
1501
+ {
1502
+ "rep": "REL_XYZ_ROT6D",
1503
+ "type": "EEF",
1504
+ "format": "XYZ_ROT6D",
1505
+ "state_key": "eef_pose",
1506
+ "input_rotation_format": "quat",
1507
+ "input_quat_order": "xyzw",
1508
+ "reference_rotation_format": "quat",
1509
+ "reference_quat_order": "xyzw",
1510
+ "translation_scaling_key": null,
1511
+ "rotation_scaling_key": null,
1512
+ "hold_through_clutch": false,
1513
+ "normalization_type": "percentile"
1514
+ },
1515
+ {
1516
+ "rep": "ABSOLUTE",
1517
+ "type": "NON_EEF",
1518
+ "format": "DEFAULT",
1519
+ "state_key": null,
1520
+ "input_rotation_format": "quat",
1521
+ "input_quat_order": "xyzw",
1522
+ "reference_rotation_format": "rot6d",
1523
+ "reference_quat_order": "xyzw",
1524
+ "translation_scaling_key": null,
1525
+ "rotation_scaling_key": null,
1526
+ "hold_through_clutch": false,
1527
+ "normalization_type": "percentile"
1528
+ }
1529
+ ]
1530
+ },
1531
+ "language": {
1532
+ "delta_indices": [
1533
+ 0
1534
+ ],
1535
+ "modality_keys": [
1536
+ "task"
1537
+ ],
1538
+ "sin_cos_embedding_keys": null,
1539
+ "mean_std_embedding_keys": null,
1540
+ "min_max_embedding_keys": null,
1541
+ "pass_through_keys": null,
1542
+ "action_configs": null
1543
+ }
1544
+ },
1545
+ "jhu_lscr_dvrk_smarts": {
1546
+ "video": {
1547
+ "delta_indices": [
1548
+ 0
1549
+ ],
1550
+ "modality_keys": [
1551
+ "endoscope_left",
1552
+ "camera_side_view"
1553
+ ],
1554
+ "sin_cos_embedding_keys": null,
1555
+ "mean_std_embedding_keys": null,
1556
+ "min_max_embedding_keys": null,
1557
+ "pass_through_keys": null,
1558
+ "action_configs": null
1559
+ },
1560
+ "state": {
1561
+ "delta_indices": [
1562
+ 0
1563
+ ],
1564
+ "modality_keys": [
1565
+ "psm1_pose",
1566
+ "psm1_gripper",
1567
+ "psm2_pose",
1568
+ "psm2_gripper"
1569
+ ],
1570
+ "sin_cos_embedding_keys": null,
1571
+ "mean_std_embedding_keys": [
1572
+ "psm1_pose",
1573
+ "psm1_gripper",
1574
+ "psm2_pose",
1575
+ "psm2_gripper"
1576
+ ],
1577
+ "min_max_embedding_keys": null,
1578
+ "pass_through_keys": null,
1579
+ "action_configs": null
1580
+ },
1581
+ "action": {
1582
+ "delta_indices": [
1583
+ 1,
1584
+ 2,
1585
+ 3,
1586
+ 4,
1587
+ 5,
1588
+ 6,
1589
+ 7,
1590
+ 8,
1591
+ 9,
1592
+ 10,
1593
+ 11,
1594
+ 12,
1595
+ 13,
1596
+ 14,
1597
+ 15,
1598
+ 16
1599
+ ],
1600
+ "modality_keys": [
1601
+ "psm1_pose",
1602
+ "psm1_gripper",
1603
+ "psm2_pose",
1604
+ "psm2_gripper"
1605
+ ],
1606
+ "sin_cos_embedding_keys": null,
1607
+ "mean_std_embedding_keys": null,
1608
+ "min_max_embedding_keys": null,
1609
+ "pass_through_keys": null,
1610
+ "action_configs": [
1611
+ {
1612
+ "rep": "REL_XYZ_ROT6D",
1613
+ "type": "EEF",
1614
+ "format": "XYZ_ROT6D",
1615
+ "state_key": "psm1_pose",
1616
+ "input_rotation_format": "quat",
1617
+ "input_quat_order": "xyzw",
1618
+ "reference_rotation_format": "quat",
1619
+ "reference_quat_order": "xyzw",
1620
+ "translation_scaling_key": null,
1621
+ "rotation_scaling_key": null,
1622
+ "hold_through_clutch": false,
1623
+ "normalization_type": "percentile"
1624
+ },
1625
+ {
1626
+ "rep": "ABSOLUTE",
1627
+ "type": "NON_EEF",
1628
+ "format": "DEFAULT",
1629
+ "state_key": null,
1630
+ "input_rotation_format": "quat",
1631
+ "input_quat_order": "xyzw",
1632
+ "reference_rotation_format": "rot6d",
1633
+ "reference_quat_order": "xyzw",
1634
+ "translation_scaling_key": null,
1635
+ "rotation_scaling_key": null,
1636
+ "hold_through_clutch": false,
1637
+ "normalization_type": "percentile"
1638
+ },
1639
+ {
1640
+ "rep": "REL_XYZ_ROT6D",
1641
+ "type": "EEF",
1642
+ "format": "XYZ_ROT6D",
1643
+ "state_key": "psm2_pose",
1644
+ "input_rotation_format": "quat",
1645
+ "input_quat_order": "xyzw",
1646
+ "reference_rotation_format": "quat",
1647
+ "reference_quat_order": "xyzw",
1648
+ "translation_scaling_key": null,
1649
+ "rotation_scaling_key": null,
1650
+ "hold_through_clutch": false,
1651
+ "normalization_type": "percentile"
1652
+ },
1653
+ {
1654
+ "rep": "ABSOLUTE",
1655
+ "type": "NON_EEF",
1656
+ "format": "DEFAULT",
1657
+ "state_key": null,
1658
+ "input_rotation_format": "quat",
1659
+ "input_quat_order": "xyzw",
1660
+ "reference_rotation_format": "rot6d",
1661
+ "reference_quat_order": "xyzw",
1662
+ "translation_scaling_key": null,
1663
+ "rotation_scaling_key": null,
1664
+ "hold_through_clutch": false,
1665
+ "normalization_type": "percentile"
1666
+ }
1667
+ ]
1668
+ },
1669
+ "language": {
1670
+ "delta_indices": [
1671
+ 0
1672
+ ],
1673
+ "modality_keys": [
1674
+ "annotation.task"
1675
+ ],
1676
+ "sin_cos_embedding_keys": null,
1677
+ "mean_std_embedding_keys": null,
1678
+ "min_max_embedding_keys": null,
1679
+ "pass_through_keys": null,
1680
+ "action_configs": null
1681
+ }
1682
+ },
1683
+ "jhu_imerse_dvrk_mono": {
1684
+ "video": {
1685
+ "delta_indices": [
1686
+ 0
1687
+ ],
1688
+ "modality_keys": [
1689
+ "endoscope_left"
1690
+ ],
1691
+ "sin_cos_embedding_keys": null,
1692
+ "mean_std_embedding_keys": null,
1693
+ "min_max_embedding_keys": null,
1694
+ "pass_through_keys": null,
1695
+ "action_configs": null
1696
+ },
1697
+ "state": {
1698
+ "delta_indices": [
1699
+ 0
1700
+ ],
1701
+ "modality_keys": [
1702
+ "psm1_pose",
1703
+ "psm1_gripper",
1704
+ "psm2_pose",
1705
+ "psm2_gripper"
1706
+ ],
1707
+ "sin_cos_embedding_keys": null,
1708
+ "mean_std_embedding_keys": [
1709
+ "psm1_pose",
1710
+ "psm1_gripper",
1711
+ "psm2_pose",
1712
+ "psm2_gripper"
1713
+ ],
1714
+ "min_max_embedding_keys": null,
1715
+ "pass_through_keys": null,
1716
+ "action_configs": null
1717
+ },
1718
+ "action": {
1719
+ "delta_indices": [
1720
+ 0,
1721
+ 1,
1722
+ 2,
1723
+ 3,
1724
+ 4,
1725
+ 5,
1726
+ 6,
1727
+ 7,
1728
+ 8,
1729
+ 9,
1730
+ 10,
1731
+ 11,
1732
+ 12,
1733
+ 13,
1734
+ 14,
1735
+ 15,
1736
+ 16,
1737
+ 17,
1738
+ 18,
1739
+ 19,
1740
+ 20,
1741
+ 21,
1742
+ 22,
1743
+ 23,
1744
+ 24,
1745
+ 25,
1746
+ 26,
1747
+ 27,
1748
+ 28,
1749
+ 29,
1750
+ 30,
1751
+ 31,
1752
+ 32,
1753
+ 33,
1754
+ 34,
1755
+ 35,
1756
+ 36,
1757
+ 37,
1758
+ 38,
1759
+ 39,
1760
+ 40,
1761
+ 41,
1762
+ 42,
1763
+ 43,
1764
+ 44,
1765
+ 45,
1766
+ 46,
1767
+ 47,
1768
+ 48,
1769
+ 49
1770
+ ],
1771
+ "modality_keys": [
1772
+ "psm1_pose",
1773
+ "psm1_gripper",
1774
+ "psm2_pose",
1775
+ "psm2_gripper"
1776
+ ],
1777
+ "sin_cos_embedding_keys": null,
1778
+ "mean_std_embedding_keys": null,
1779
+ "min_max_embedding_keys": null,
1780
+ "pass_through_keys": null,
1781
+ "action_configs": [
1782
+ {
1783
+ "rep": "REL_XYZ_ROT6D",
1784
+ "type": "EEF",
1785
+ "format": "XYZ_ROT6D",
1786
+ "state_key": "psm1_pose",
1787
+ "input_rotation_format": "quat",
1788
+ "input_quat_order": "xyzw",
1789
+ "reference_rotation_format": "quat",
1790
+ "reference_quat_order": "xyzw",
1791
+ "translation_scaling_key": null,
1792
+ "rotation_scaling_key": null,
1793
+ "hold_through_clutch": false,
1794
+ "normalization_type": "temporal_meanstd"
1795
+ },
1796
+ {
1797
+ "rep": "ABSOLUTE",
1798
+ "type": "NON_EEF",
1799
+ "format": "DEFAULT",
1800
+ "state_key": null,
1801
+ "input_rotation_format": "quat",
1802
+ "input_quat_order": "xyzw",
1803
+ "reference_rotation_format": "rot6d",
1804
+ "reference_quat_order": "xyzw",
1805
+ "translation_scaling_key": null,
1806
+ "rotation_scaling_key": null,
1807
+ "hold_through_clutch": false,
1808
+ "normalization_type": "temporal_meanstd"
1809
+ },
1810
+ {
1811
+ "rep": "REL_XYZ_ROT6D",
1812
+ "type": "EEF",
1813
+ "format": "XYZ_ROT6D",
1814
+ "state_key": "psm2_pose",
1815
+ "input_rotation_format": "quat",
1816
+ "input_quat_order": "xyzw",
1817
+ "reference_rotation_format": "quat",
1818
+ "reference_quat_order": "xyzw",
1819
+ "translation_scaling_key": null,
1820
+ "rotation_scaling_key": null,
1821
+ "hold_through_clutch": false,
1822
+ "normalization_type": "temporal_meanstd"
1823
+ },
1824
+ {
1825
+ "rep": "ABSOLUTE",
1826
+ "type": "NON_EEF",
1827
+ "format": "DEFAULT",
1828
+ "state_key": null,
1829
+ "input_rotation_format": "quat",
1830
+ "input_quat_order": "xyzw",
1831
+ "reference_rotation_format": "rot6d",
1832
+ "reference_quat_order": "xyzw",
1833
+ "translation_scaling_key": null,
1834
+ "rotation_scaling_key": null,
1835
+ "hold_through_clutch": false,
1836
+ "normalization_type": "temporal_meanstd"
1837
+ }
1838
+ ]
1839
+ },
1840
+ "language": {
1841
+ "delta_indices": [
1842
+ 0
1843
+ ],
1844
+ "modality_keys": [
1845
+ "annotation.human.task_description"
1846
+ ],
1847
+ "sin_cos_embedding_keys": null,
1848
+ "mean_std_embedding_keys": null,
1849
+ "min_max_embedding_keys": null,
1850
+ "pass_through_keys": null,
1851
+ "action_configs": null
1852
+ }
1853
+ },
1854
+ "rob_surgical_bitrack": {
1855
+ "video": {
1856
+ "delta_indices": [
1857
+ 0
1858
+ ],
1859
+ "modality_keys": [
1860
+ "endoscope"
1861
+ ],
1862
+ "sin_cos_embedding_keys": null,
1863
+ "mean_std_embedding_keys": null,
1864
+ "min_max_embedding_keys": null,
1865
+ "pass_through_keys": null,
1866
+ "action_configs": null
1867
+ },
1868
+ "state": {
1869
+ "delta_indices": [
1870
+ 0
1871
+ ],
1872
+ "modality_keys": [
1873
+ "left_pose",
1874
+ "right_pose",
1875
+ "aux_pose"
1876
+ ],
1877
+ "sin_cos_embedding_keys": null,
1878
+ "mean_std_embedding_keys": [
1879
+ "left_pose",
1880
+ "right_pose",
1881
+ "aux_pose"
1882
+ ],
1883
+ "min_max_embedding_keys": null,
1884
+ "pass_through_keys": null,
1885
+ "action_configs": null
1886
+ },
1887
+ "action": {
1888
+ "delta_indices": [
1889
+ 0,
1890
+ 1,
1891
+ 2,
1892
+ 3,
1893
+ 4,
1894
+ 5,
1895
+ 6,
1896
+ 7,
1897
+ 8,
1898
+ 9,
1899
+ 10,
1900
+ 11,
1901
+ 12,
1902
+ 13,
1903
+ 14,
1904
+ 15,
1905
+ 16,
1906
+ 17,
1907
+ 18,
1908
+ 19,
1909
+ 20,
1910
+ 21,
1911
+ 22,
1912
+ 23,
1913
+ 24,
1914
+ 25,
1915
+ 26,
1916
+ 27,
1917
+ 28,
1918
+ 29,
1919
+ 30,
1920
+ 31,
1921
+ 32,
1922
+ 33,
1923
+ 34,
1924
+ 35,
1925
+ 36,
1926
+ 37,
1927
+ 38,
1928
+ 39,
1929
+ 40,
1930
+ 41,
1931
+ 42,
1932
+ 43,
1933
+ 44,
1934
+ 45,
1935
+ 46,
1936
+ 47,
1937
+ 48,
1938
+ 49
1939
+ ],
1940
+ "modality_keys": [
1941
+ "left_pose",
1942
+ "right_pose",
1943
+ "aux_pose"
1944
+ ],
1945
+ "sin_cos_embedding_keys": null,
1946
+ "mean_std_embedding_keys": null,
1947
+ "min_max_embedding_keys": null,
1948
+ "pass_through_keys": null,
1949
+ "action_configs": [
1950
+ {
1951
+ "rep": "REL_XYZ_ROT6D",
1952
+ "type": "EEF",
1953
+ "format": "XYZ_ROT6D",
1954
+ "state_key": "left_pose",
1955
+ "input_rotation_format": "euler",
1956
+ "input_quat_order": "xyzw",
1957
+ "reference_rotation_format": "euler",
1958
+ "reference_quat_order": "xyzw",
1959
+ "translation_scaling_key": null,
1960
+ "rotation_scaling_key": null,
1961
+ "hold_through_clutch": false,
1962
+ "normalization_type": "percentile"
1963
+ },
1964
+ {
1965
+ "rep": "REL_XYZ_ROT6D",
1966
+ "type": "EEF",
1967
+ "format": "XYZ_ROT6D",
1968
+ "state_key": "right_pose",
1969
+ "input_rotation_format": "euler",
1970
+ "input_quat_order": "xyzw",
1971
+ "reference_rotation_format": "euler",
1972
+ "reference_quat_order": "xyzw",
1973
+ "translation_scaling_key": null,
1974
+ "rotation_scaling_key": null,
1975
+ "hold_through_clutch": false,
1976
+ "normalization_type": "percentile"
1977
+ },
1978
+ {
1979
+ "rep": "REL_XYZ_ROT6D",
1980
+ "type": "EEF",
1981
+ "format": "XYZ_ROT6D",
1982
+ "state_key": "aux_pose",
1983
+ "input_rotation_format": "euler",
1984
+ "input_quat_order": "xyzw",
1985
+ "reference_rotation_format": "euler",
1986
+ "reference_quat_order": "xyzw",
1987
+ "translation_scaling_key": null,
1988
+ "rotation_scaling_key": null,
1989
+ "hold_through_clutch": false,
1990
+ "normalization_type": "percentile"
1991
+ }
1992
+ ]
1993
+ },
1994
+ "language": {
1995
+ "delta_indices": [
1996
+ 0
1997
+ ],
1998
+ "modality_keys": [
1999
+ "annotation.instruction"
2000
+ ],
2001
+ "sin_cos_embedding_keys": null,
2002
+ "mean_std_embedding_keys": null,
2003
+ "min_max_embedding_keys": null,
2004
+ "pass_through_keys": null,
2005
+ "action_configs": null
2006
+ }
2007
+ },
2008
+ "turin_mitic_ex_vivo": {
2009
+ "video": {
2010
+ "delta_indices": [
2011
+ 0
2012
+ ],
2013
+ "modality_keys": [
2014
+ "endoscope_left"
2015
+ ],
2016
+ "sin_cos_embedding_keys": null,
2017
+ "mean_std_embedding_keys": null,
2018
+ "min_max_embedding_keys": null,
2019
+ "pass_through_keys": null,
2020
+ "action_configs": null
2021
+ },
2022
+ "state": {
2023
+ "delta_indices": [
2024
+ 0
2025
+ ],
2026
+ "modality_keys": [
2027
+ "psm1_joints",
2028
+ "psm2_joints",
2029
+ "psm1_pose",
2030
+ "psm2_pose"
2031
+ ],
2032
+ "sin_cos_embedding_keys": null,
2033
+ "mean_std_embedding_keys": [
2034
+ "psm1_joints",
2035
+ "psm2_joints"
2036
+ ],
2037
+ "min_max_embedding_keys": null,
2038
+ "pass_through_keys": [
2039
+ "psm1_pose",
2040
+ "psm2_pose"
2041
+ ],
2042
+ "action_configs": null
2043
+ },
2044
+ "action": {
2045
+ "delta_indices": [
2046
+ 1,
2047
+ 2,
2048
+ 3,
2049
+ 4,
2050
+ 5,
2051
+ 6,
2052
+ 7,
2053
+ 8,
2054
+ 9,
2055
+ 10,
2056
+ 11,
2057
+ 12,
2058
+ 13,
2059
+ 14,
2060
+ 15,
2061
+ 16,
2062
+ 17,
2063
+ 18,
2064
+ 19,
2065
+ 20,
2066
+ 21,
2067
+ 22,
2068
+ 23,
2069
+ 24,
2070
+ 25,
2071
+ 26,
2072
+ 27,
2073
+ 28,
2074
+ 29,
2075
+ 30,
2076
+ 31,
2077
+ 32,
2078
+ 33,
2079
+ 34,
2080
+ 35,
2081
+ 36,
2082
+ 37,
2083
+ 38,
2084
+ 39,
2085
+ 40,
2086
+ 41,
2087
+ 42,
2088
+ 43,
2089
+ 44,
2090
+ 45,
2091
+ 46,
2092
+ 47,
2093
+ 48,
2094
+ 49,
2095
+ 50
2096
+ ],
2097
+ "modality_keys": [
2098
+ "psm1_pose",
2099
+ "psm2_pose"
2100
+ ],
2101
+ "sin_cos_embedding_keys": null,
2102
+ "mean_std_embedding_keys": null,
2103
+ "min_max_embedding_keys": null,
2104
+ "pass_through_keys": null,
2105
+ "action_configs": [
2106
+ {
2107
+ "rep": "REL_XYZ_ROT6D",
2108
+ "type": "EEF",
2109
+ "format": "XYZ_ROT6D",
2110
+ "state_key": "psm1_pose",
2111
+ "input_rotation_format": "quat",
2112
+ "input_quat_order": "xyzw",
2113
+ "reference_rotation_format": "quat",
2114
+ "reference_quat_order": "xyzw",
2115
+ "translation_scaling_key": null,
2116
+ "rotation_scaling_key": null,
2117
+ "hold_through_clutch": false,
2118
+ "normalization_type": "percentile"
2119
+ },
2120
+ {
2121
+ "rep": "REL_XYZ_ROT6D",
2122
+ "type": "EEF",
2123
+ "format": "XYZ_ROT6D",
2124
+ "state_key": "psm2_pose",
2125
+ "input_rotation_format": "quat",
2126
+ "input_quat_order": "xyzw",
2127
+ "reference_rotation_format": "quat",
2128
+ "reference_quat_order": "xyzw",
2129
+ "translation_scaling_key": null,
2130
+ "rotation_scaling_key": null,
2131
+ "hold_through_clutch": false,
2132
+ "normalization_type": "percentile"
2133
+ }
2134
+ ]
2135
+ },
2136
+ "language": {
2137
+ "delta_indices": [
2138
+ 0
2139
+ ],
2140
+ "modality_keys": [
2141
+ "annotation.instruction"
2142
+ ],
2143
+ "sin_cos_embedding_keys": null,
2144
+ "mean_std_embedding_keys": null,
2145
+ "min_max_embedding_keys": null,
2146
+ "pass_through_keys": null,
2147
+ "action_configs": null
2148
+ }
2149
+ },
2150
+ "ustc_torin_tuodao": {
2151
+ "video": {
2152
+ "delta_indices": [
2153
+ 0
2154
+ ],
2155
+ "modality_keys": [
2156
+ "endoscope_left"
2157
+ ],
2158
+ "sin_cos_embedding_keys": null,
2159
+ "mean_std_embedding_keys": null,
2160
+ "min_max_embedding_keys": null,
2161
+ "pass_through_keys": null,
2162
+ "action_configs": null
2163
+ },
2164
+ "state": {
2165
+ "delta_indices": [
2166
+ 0
2167
+ ],
2168
+ "modality_keys": [
2169
+ "left_joints",
2170
+ "right_joints",
2171
+ "left_pose",
2172
+ "right_pose"
2173
+ ],
2174
+ "sin_cos_embedding_keys": null,
2175
+ "mean_std_embedding_keys": [
2176
+ "left_joints",
2177
+ "right_joints"
2178
+ ],
2179
+ "min_max_embedding_keys": null,
2180
+ "pass_through_keys": [
2181
+ "left_pose",
2182
+ "right_pose"
2183
+ ],
2184
+ "action_configs": null
2185
+ },
2186
+ "action": {
2187
+ "delta_indices": [
2188
+ 0,
2189
+ 1,
2190
+ 2,
2191
+ 3,
2192
+ 4,
2193
+ 5,
2194
+ 6,
2195
+ 7,
2196
+ 8,
2197
+ 9,
2198
+ 10,
2199
+ 11,
2200
+ 12,
2201
+ 13,
2202
+ 14,
2203
+ 15,
2204
+ 16,
2205
+ 17,
2206
+ 18,
2207
+ 19,
2208
+ 20,
2209
+ 21,
2210
+ 22,
2211
+ 23,
2212
+ 24,
2213
+ 25,
2214
+ 26,
2215
+ 27,
2216
+ 28,
2217
+ 29,
2218
+ 30,
2219
+ 31,
2220
+ 32,
2221
+ 33,
2222
+ 34,
2223
+ 35,
2224
+ 36,
2225
+ 37,
2226
+ 38,
2227
+ 39,
2228
+ 40,
2229
+ 41,
2230
+ 42,
2231
+ 43,
2232
+ 44,
2233
+ 45,
2234
+ 46,
2235
+ 47,
2236
+ 48,
2237
+ 49
2238
+ ],
2239
+ "modality_keys": [
2240
+ "left_pose",
2241
+ "left_gripper",
2242
+ "right_pose",
2243
+ "right_gripper"
2244
+ ],
2245
+ "sin_cos_embedding_keys": null,
2246
+ "mean_std_embedding_keys": null,
2247
+ "min_max_embedding_keys": null,
2248
+ "pass_through_keys": null,
2249
+ "action_configs": [
2250
+ {
2251
+ "rep": "REL_XYZ_ROT6D",
2252
+ "type": "EEF",
2253
+ "format": "XYZ_ROT6D",
2254
+ "state_key": "left_pose",
2255
+ "input_rotation_format": "quat",
2256
+ "input_quat_order": "xyzw",
2257
+ "reference_rotation_format": "quat",
2258
+ "reference_quat_order": "xyzw",
2259
+ "translation_scaling_key": null,
2260
+ "rotation_scaling_key": null,
2261
+ "hold_through_clutch": false,
2262
+ "normalization_type": "percentile"
2263
+ },
2264
+ {
2265
+ "rep": "ABSOLUTE",
2266
+ "type": "NON_EEF",
2267
+ "format": "DEFAULT",
2268
+ "state_key": null,
2269
+ "input_rotation_format": "quat",
2270
+ "input_quat_order": "xyzw",
2271
+ "reference_rotation_format": "rot6d",
2272
+ "reference_quat_order": "xyzw",
2273
+ "translation_scaling_key": null,
2274
+ "rotation_scaling_key": null,
2275
+ "hold_through_clutch": false,
2276
+ "normalization_type": "percentile"
2277
+ },
2278
+ {
2279
+ "rep": "REL_XYZ_ROT6D",
2280
+ "type": "EEF",
2281
+ "format": "XYZ_ROT6D",
2282
+ "state_key": "right_pose",
2283
+ "input_rotation_format": "quat",
2284
+ "input_quat_order": "xyzw",
2285
+ "reference_rotation_format": "quat",
2286
+ "reference_quat_order": "xyzw",
2287
+ "translation_scaling_key": null,
2288
+ "rotation_scaling_key": null,
2289
+ "hold_through_clutch": false,
2290
+ "normalization_type": "percentile"
2291
+ },
2292
+ {
2293
+ "rep": "ABSOLUTE",
2294
+ "type": "NON_EEF",
2295
+ "format": "DEFAULT",
2296
+ "state_key": null,
2297
+ "input_rotation_format": "quat",
2298
+ "input_quat_order": "xyzw",
2299
+ "reference_rotation_format": "rot6d",
2300
+ "reference_quat_order": "xyzw",
2301
+ "translation_scaling_key": null,
2302
+ "rotation_scaling_key": null,
2303
+ "hold_through_clutch": false,
2304
+ "normalization_type": "percentile"
2305
+ }
2306
+ ]
2307
+ },
2308
+ "language": {
2309
+ "delta_indices": [
2310
+ 0
2311
+ ],
2312
+ "modality_keys": [
2313
+ "annotation.instruction"
2314
+ ],
2315
+ "sin_cos_embedding_keys": null,
2316
+ "mean_std_embedding_keys": null,
2317
+ "min_max_embedding_keys": null,
2318
+ "pass_through_keys": null,
2319
+ "action_configs": null
2320
+ }
2321
+ },
2322
+ "hamlyn_dvrk_30hz": {
2323
+ "video": {
2324
+ "delta_indices": [
2325
+ 0
2326
+ ],
2327
+ "modality_keys": [
2328
+ "endoscope",
2329
+ "wrist_left",
2330
+ "wrist_right"
2331
+ ],
2332
+ "sin_cos_embedding_keys": null,
2333
+ "mean_std_embedding_keys": null,
2334
+ "min_max_embedding_keys": null,
2335
+ "pass_through_keys": null,
2336
+ "action_configs": null
2337
+ },
2338
+ "state": {
2339
+ "delta_indices": [
2340
+ 0
2341
+ ],
2342
+ "modality_keys": [
2343
+ "left_arm_pose",
2344
+ "left_arm_gripper",
2345
+ "right_arm_pose",
2346
+ "right_arm_gripper"
2347
+ ],
2348
+ "sin_cos_embedding_keys": null,
2349
+ "mean_std_embedding_keys": [
2350
+ "left_arm_pose",
2351
+ "left_arm_gripper",
2352
+ "right_arm_pose",
2353
+ "right_arm_gripper"
2354
+ ],
2355
+ "min_max_embedding_keys": null,
2356
+ "pass_through_keys": null,
2357
+ "action_configs": null
2358
+ },
2359
+ "action": {
2360
+ "delta_indices": [
2361
+ 0,
2362
+ 1,
2363
+ 2,
2364
+ 3,
2365
+ 4,
2366
+ 5,
2367
+ 6,
2368
+ 7,
2369
+ 8,
2370
+ 9,
2371
+ 10,
2372
+ 11,
2373
+ 12,
2374
+ 13,
2375
+ 14,
2376
+ 15,
2377
+ 16,
2378
+ 17,
2379
+ 18,
2380
+ 19,
2381
+ 20,
2382
+ 21,
2383
+ 22,
2384
+ 23,
2385
+ 24,
2386
+ 25,
2387
+ 26,
2388
+ 27,
2389
+ 28,
2390
+ 29,
2391
+ 30,
2392
+ 31,
2393
+ 32,
2394
+ 33,
2395
+ 34,
2396
+ 35,
2397
+ 36,
2398
+ 37,
2399
+ 38,
2400
+ 39,
2401
+ 40,
2402
+ 41,
2403
+ 42,
2404
+ 43,
2405
+ 44,
2406
+ 45,
2407
+ 46,
2408
+ 47,
2409
+ 48,
2410
+ 49
2411
+ ],
2412
+ "modality_keys": [
2413
+ "left_arm_pose",
2414
+ "left_arm_gripper",
2415
+ "right_arm_pose",
2416
+ "right_arm_gripper"
2417
+ ],
2418
+ "sin_cos_embedding_keys": null,
2419
+ "mean_std_embedding_keys": null,
2420
+ "min_max_embedding_keys": null,
2421
+ "pass_through_keys": null,
2422
+ "action_configs": [
2423
+ {
2424
+ "rep": "REL_XYZ_ROT6D",
2425
+ "type": "EEF",
2426
+ "format": "XYZ_ROT6D",
2427
+ "state_key": "left_arm_pose",
2428
+ "input_rotation_format": "quat",
2429
+ "input_quat_order": "wxyz",
2430
+ "reference_rotation_format": "quat",
2431
+ "reference_quat_order": "wxyz",
2432
+ "translation_scaling_key": null,
2433
+ "rotation_scaling_key": null,
2434
+ "hold_through_clutch": false,
2435
+ "normalization_type": "percentile"
2436
+ },
2437
+ {
2438
+ "rep": "ABSOLUTE",
2439
+ "type": "NON_EEF",
2440
+ "format": "DEFAULT",
2441
+ "state_key": null,
2442
+ "input_rotation_format": "quat",
2443
+ "input_quat_order": "xyzw",
2444
+ "reference_rotation_format": "rot6d",
2445
+ "reference_quat_order": "xyzw",
2446
+ "translation_scaling_key": null,
2447
+ "rotation_scaling_key": null,
2448
+ "hold_through_clutch": false,
2449
+ "normalization_type": "percentile"
2450
+ },
2451
+ {
2452
+ "rep": "REL_XYZ_ROT6D",
2453
+ "type": "EEF",
2454
+ "format": "XYZ_ROT6D",
2455
+ "state_key": "right_arm_pose",
2456
+ "input_rotation_format": "quat",
2457
+ "input_quat_order": "wxyz",
2458
+ "reference_rotation_format": "quat",
2459
+ "reference_quat_order": "wxyz",
2460
+ "translation_scaling_key": null,
2461
+ "rotation_scaling_key": null,
2462
+ "hold_through_clutch": false,
2463
+ "normalization_type": "percentile"
2464
+ },
2465
+ {
2466
+ "rep": "ABSOLUTE",
2467
+ "type": "NON_EEF",
2468
+ "format": "DEFAULT",
2469
+ "state_key": null,
2470
+ "input_rotation_format": "quat",
2471
+ "input_quat_order": "xyzw",
2472
+ "reference_rotation_format": "rot6d",
2473
+ "reference_quat_order": "xyzw",
2474
+ "translation_scaling_key": null,
2475
+ "rotation_scaling_key": null,
2476
+ "hold_through_clutch": false,
2477
+ "normalization_type": "percentile"
2478
+ }
2479
+ ]
2480
+ },
2481
+ "language": {
2482
+ "delta_indices": [
2483
+ 0
2484
+ ],
2485
+ "modality_keys": [
2486
+ "task"
2487
+ ],
2488
+ "sin_cos_embedding_keys": null,
2489
+ "mean_std_embedding_keys": null,
2490
+ "min_max_embedding_keys": null,
2491
+ "pass_through_keys": null,
2492
+ "action_configs": null
2493
+ }
2494
+ },
2495
+ "ucb_dvrk": {
2496
+ "video": {
2497
+ "delta_indices": [
2498
+ 0
2499
+ ],
2500
+ "modality_keys": [
2501
+ "camera_left"
2502
+ ],
2503
+ "sin_cos_embedding_keys": null,
2504
+ "mean_std_embedding_keys": null,
2505
+ "min_max_embedding_keys": null,
2506
+ "pass_through_keys": null,
2507
+ "action_configs": null
2508
+ },
2509
+ "state": {
2510
+ "delta_indices": [
2511
+ 0
2512
+ ],
2513
+ "modality_keys": [
2514
+ "psm1_joints",
2515
+ "psm1_gripper",
2516
+ "psm2_joints",
2517
+ "psm2_gripper",
2518
+ "psm1_pose",
2519
+ "psm2_pose"
2520
+ ],
2521
+ "sin_cos_embedding_keys": null,
2522
+ "mean_std_embedding_keys": [
2523
+ "psm1_joints",
2524
+ "psm1_gripper",
2525
+ "psm2_joints",
2526
+ "psm2_gripper"
2527
+ ],
2528
+ "min_max_embedding_keys": null,
2529
+ "pass_through_keys": [
2530
+ "psm1_pose",
2531
+ "psm2_pose"
2532
+ ],
2533
+ "action_configs": null
2534
+ },
2535
+ "action": {
2536
+ "delta_indices": [
2537
+ 0,
2538
+ 1,
2539
+ 2,
2540
+ 3,
2541
+ 4,
2542
+ 5,
2543
+ 6,
2544
+ 7,
2545
+ 8,
2546
+ 9,
2547
+ 10,
2548
+ 11,
2549
+ 12,
2550
+ 13,
2551
+ 14,
2552
+ 15,
2553
+ 16,
2554
+ 17,
2555
+ 18,
2556
+ 19,
2557
+ 20,
2558
+ 21,
2559
+ 22,
2560
+ 23,
2561
+ 24,
2562
+ 25,
2563
+ 26,
2564
+ 27,
2565
+ 28,
2566
+ 29,
2567
+ 30,
2568
+ 31,
2569
+ 32,
2570
+ 33,
2571
+ 34,
2572
+ 35,
2573
+ 36,
2574
+ 37,
2575
+ 38,
2576
+ 39,
2577
+ 40,
2578
+ 41,
2579
+ 42,
2580
+ 43,
2581
+ 44,
2582
+ 45,
2583
+ 46,
2584
+ 47,
2585
+ 48,
2586
+ 49
2587
+ ],
2588
+ "modality_keys": [
2589
+ "psm1_pose",
2590
+ "psm1_gripper",
2591
+ "psm2_pose",
2592
+ "psm2_gripper"
2593
+ ],
2594
+ "sin_cos_embedding_keys": null,
2595
+ "mean_std_embedding_keys": null,
2596
+ "min_max_embedding_keys": null,
2597
+ "pass_through_keys": null,
2598
+ "action_configs": [
2599
+ {
2600
+ "rep": "REL_XYZ_ROT6D",
2601
+ "type": "EEF",
2602
+ "format": "XYZ_ROT6D",
2603
+ "state_key": "psm1_pose",
2604
+ "input_rotation_format": "quat",
2605
+ "input_quat_order": "xyzw",
2606
+ "reference_rotation_format": "quat",
2607
+ "reference_quat_order": "xyzw",
2608
+ "translation_scaling_key": null,
2609
+ "rotation_scaling_key": null,
2610
+ "hold_through_clutch": false,
2611
+ "normalization_type": "percentile"
2612
+ },
2613
+ {
2614
+ "rep": "ABSOLUTE",
2615
+ "type": "NON_EEF",
2616
+ "format": "DEFAULT",
2617
+ "state_key": null,
2618
+ "input_rotation_format": "quat",
2619
+ "input_quat_order": "xyzw",
2620
+ "reference_rotation_format": "rot6d",
2621
+ "reference_quat_order": "xyzw",
2622
+ "translation_scaling_key": null,
2623
+ "rotation_scaling_key": null,
2624
+ "hold_through_clutch": false,
2625
+ "normalization_type": "percentile"
2626
+ },
2627
+ {
2628
+ "rep": "REL_XYZ_ROT6D",
2629
+ "type": "EEF",
2630
+ "format": "XYZ_ROT6D",
2631
+ "state_key": "psm2_pose",
2632
+ "input_rotation_format": "quat",
2633
+ "input_quat_order": "xyzw",
2634
+ "reference_rotation_format": "quat",
2635
+ "reference_quat_order": "xyzw",
2636
+ "translation_scaling_key": null,
2637
+ "rotation_scaling_key": null,
2638
+ "hold_through_clutch": false,
2639
+ "normalization_type": "percentile"
2640
+ },
2641
+ {
2642
+ "rep": "ABSOLUTE",
2643
+ "type": "NON_EEF",
2644
+ "format": "DEFAULT",
2645
+ "state_key": null,
2646
+ "input_rotation_format": "quat",
2647
+ "input_quat_order": "xyzw",
2648
+ "reference_rotation_format": "rot6d",
2649
+ "reference_quat_order": "xyzw",
2650
+ "translation_scaling_key": null,
2651
+ "rotation_scaling_key": null,
2652
+ "hold_through_clutch": false,
2653
+ "normalization_type": "percentile"
2654
+ }
2655
+ ]
2656
+ },
2657
+ "language": {
2658
+ "delta_indices": [
2659
+ 0
2660
+ ],
2661
+ "modality_keys": [
2662
+ "task"
2663
+ ],
2664
+ "sin_cos_embedding_keys": null,
2665
+ "mean_std_embedding_keys": null,
2666
+ "min_max_embedding_keys": null,
2667
+ "pass_through_keys": null,
2668
+ "action_configs": null
2669
+ }
2670
+ },
2671
+ "jhu_imerse_star_il": {
2672
+ "video": {
2673
+ "delta_indices": [
2674
+ 0
2675
+ ],
2676
+ "modality_keys": [
2677
+ "endoscope_left",
2678
+ "wrist_left"
2679
+ ],
2680
+ "sin_cos_embedding_keys": null,
2681
+ "mean_std_embedding_keys": null,
2682
+ "min_max_embedding_keys": null,
2683
+ "pass_through_keys": null,
2684
+ "action_configs": null
2685
+ },
2686
+ "state": {
2687
+ "delta_indices": [
2688
+ 0
2689
+ ],
2690
+ "modality_keys": [
2691
+ "kuka_joint_pos",
2692
+ "endo360_joint_pos",
2693
+ "kuka_pose"
2694
+ ],
2695
+ "sin_cos_embedding_keys": null,
2696
+ "mean_std_embedding_keys": [
2697
+ "kuka_joint_pos",
2698
+ "endo360_joint_pos"
2699
+ ],
2700
+ "min_max_embedding_keys": null,
2701
+ "pass_through_keys": [
2702
+ "kuka_pose"
2703
+ ],
2704
+ "action_configs": null
2705
+ },
2706
+ "action": {
2707
+ "delta_indices": [
2708
+ 1,
2709
+ 2,
2710
+ 3,
2711
+ 4,
2712
+ 5,
2713
+ 6,
2714
+ 7,
2715
+ 8,
2716
+ 9,
2717
+ 10,
2718
+ 11,
2719
+ 12,
2720
+ 13,
2721
+ 14,
2722
+ 15,
2723
+ 16,
2724
+ 17,
2725
+ 18,
2726
+ 19,
2727
+ 20,
2728
+ 21,
2729
+ 22,
2730
+ 23,
2731
+ 24,
2732
+ 25,
2733
+ 26,
2734
+ 27,
2735
+ 28,
2736
+ 29,
2737
+ 30,
2738
+ 31,
2739
+ 32,
2740
+ 33,
2741
+ 34,
2742
+ 35,
2743
+ 36,
2744
+ 37,
2745
+ 38,
2746
+ 39,
2747
+ 40,
2748
+ 41,
2749
+ 42,
2750
+ 43,
2751
+ 44,
2752
+ 45,
2753
+ 46,
2754
+ 47,
2755
+ 48,
2756
+ 49,
2757
+ 50
2758
+ ],
2759
+ "modality_keys": [
2760
+ "kuka_pose"
2761
+ ],
2762
+ "sin_cos_embedding_keys": null,
2763
+ "mean_std_embedding_keys": null,
2764
+ "min_max_embedding_keys": null,
2765
+ "pass_through_keys": null,
2766
+ "action_configs": [
2767
+ {
2768
+ "rep": "REL_XYZ_ROT6D",
2769
+ "type": "EEF",
2770
+ "format": "XYZ_ROT6D",
2771
+ "state_key": "kuka_pose",
2772
+ "input_rotation_format": "quat",
2773
+ "input_quat_order": "xyzw",
2774
+ "reference_rotation_format": "quat",
2775
+ "reference_quat_order": "xyzw",
2776
+ "translation_scaling_key": null,
2777
+ "rotation_scaling_key": null,
2778
+ "hold_through_clutch": false,
2779
+ "normalization_type": "percentile"
2780
+ }
2781
+ ]
2782
+ },
2783
+ "language": {
2784
+ "delta_indices": [
2785
+ 0
2786
+ ],
2787
+ "modality_keys": [
2788
+ "annotation.human.task_description"
2789
+ ],
2790
+ "sin_cos_embedding_keys": null,
2791
+ "mean_std_embedding_keys": null,
2792
+ "min_max_embedding_keys": null,
2793
+ "pass_through_keys": null,
2794
+ "action_configs": null
2795
+ }
2796
+ }
2797
+ },
2798
+ "image_crop_size": null,
2799
+ "image_target_size": null,
2800
+ "use_albumentations": true,
2801
+ "random_rotation_angle": null,
2802
+ "color_jitter_params": null,
2803
+ "shortest_image_edge": 256,
2804
+ "crop_fraction": 0.95,
2805
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
2806
+ "model_type": "eagle",
2807
+ "formalize_language": true,
2808
+ "max_state_dim": 128,
2809
+ "max_action_dim": 128,
2810
+ "max_action_horizon": 50,
2811
+ "use_percentiles": false,
2812
+ "clip_outliers": true,
2813
+ "apply_sincos_state_encoding": true,
2814
+ "use_relative_action": true
2815
+ }
2816
+ }
checkpoint-85000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e59c0c3e684929cacc05fd411495d59ad80ff5d61306f24b70d77512f5b394bd
3
+ size 14645
checkpoint-85000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:630cc6620cc5b98617df64ae7952180f1aa11c7e17c08dfc74de5339b8a4c407
3
+ size 1465
checkpoint-85000/statistics.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-85000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-85000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46d07ebf9c23abaad7dedb34e5f24bcf194b2e63cd9836ca05f6a3b638164d87
3
+ size 5841