Dongkkka commited on
Commit
1ff88f0
·
verified ·
1 Parent(s): 5ea9c05

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_horizon": 40,
3
+ "add_pos_embed": true,
4
+ "apply_sincos_state_encoding": false,
5
+ "architectures": [
6
+ "Gr00tN1d7"
7
+ ],
8
+ "attn_dropout": 0.2,
9
+ "attn_implementation": null,
10
+ "backbone_embedding_dim": 2048,
11
+ "backbone_trainable_params_fp32": true,
12
+ "color_jitter_params": {
13
+ "brightness": 0.3,
14
+ "contrast": 0.4,
15
+ "hue": 0.08,
16
+ "saturation": 0.5
17
+ },
18
+ "crop_fraction": 0.95,
19
+ "diffusion_model_cfg": {
20
+ "attention_head_dim": 48,
21
+ "dropout": 0.2,
22
+ "final_dropout": true,
23
+ "interleave_self_attention": true,
24
+ "norm_type": "ada_norm",
25
+ "num_attention_heads": 32,
26
+ "num_layers": 32,
27
+ "output_dim": 1024,
28
+ "positional_embeddings": null
29
+ },
30
+ "dtype": "float32",
31
+ "exclude_state": false,
32
+ "formalize_language": true,
33
+ "hidden_size": 1024,
34
+ "image_crop_size": [
35
+ 230,
36
+ 230
37
+ ],
38
+ "image_target_size": [
39
+ 256,
40
+ 256
41
+ ],
42
+ "letter_box_transform": false,
43
+ "load_bf16": false,
44
+ "max_action_dim": 132,
45
+ "max_num_embodiments": 32,
46
+ "max_seq_len": 1024,
47
+ "max_state_dim": 132,
48
+ "model_dtype": "bfloat16",
49
+ "model_name": "nvidia/Cosmos-Reason2-2B",
50
+ "model_type": "Gr00tN1d7",
51
+ "noise_beta_alpha": 1.5,
52
+ "noise_beta_beta": 1.0,
53
+ "noise_s": 0.999,
54
+ "num_inference_timesteps": 4,
55
+ "num_timestep_buckets": 1000,
56
+ "random_history_crop": true,
57
+ "random_rotation_angle": 0,
58
+ "reproject_vision": false,
59
+ "rtc_ramp_rate": 6.0,
60
+ "select_layer": 16,
61
+ "shortest_image_edge": 256,
62
+ "state_dropout_prob": 0.2,
63
+ "state_gaussian_noise_std": 0.0,
64
+ "transformers_version": "4.57.3",
65
+ "tune_diffusion_model": true,
66
+ "tune_linear": true,
67
+ "tune_llm": false,
68
+ "tune_projector": true,
69
+ "tune_top_llm_layers": 0,
70
+ "tune_top_visual_layers": 0,
71
+ "tune_visual": false,
72
+ "tune_vlln": true,
73
+ "use_albumentations": true,
74
+ "use_alternate_vl_dit": true,
75
+ "use_flash_attention": true,
76
+ "use_future_tokens": false,
77
+ "use_mean_std": false,
78
+ "use_percentiles": true,
79
+ "use_vl_self_attention": true,
80
+ "use_vlln": true,
81
+ "vl_self_attention_cfg": {
82
+ "attention_head_dim": 64,
83
+ "dropout": 0.2,
84
+ "final_dropout": true,
85
+ "num_attention_heads": 32,
86
+ "num_layers": 4,
87
+ "positional_embeddings": null
88
+ }
89
+ }
embodiment_id.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "robocasa_panda_omron": 13,
3
+ "oxe_droid": 17,
4
+ "oxe_fractal": 18,
5
+ "oxe_language_table": 19,
6
+ "oxe_bridge": 20,
7
+ "unknown": 22,
8
+ "gr1_unified": 20,
9
+ "agibot": 26,
10
+ "sim_behavior_r1_pro": 23,
11
+ "xdof": 24,
12
+ "xdof_oss_data": 25,
13
+ "unitree_g1_full_body_with_waist_height_nav_cmd": 25,
14
+ "real_r1_pro_sharpa": 27,
15
+ "real_r1_pro_sharpa_add_view": 27,
16
+ "real_r1_pro_sharpa_relative_arm_joint": 26,
17
+ "real_r1_pro_sharpa_delta_eef": 26,
18
+ "real_r1_pro_sharpa_absolute_eef": 26,
19
+ "real_r1_pro_sharpa_meanstd": 26,
20
+ "real_r1_pro_sharpa_relative_eef": 26,
21
+ "real_r1_pro_sharpa_relative_eef_add_view": 26,
22
+ "real_r1_pro_sharpa_relative_eef_relative_hand": 26,
23
+ "real_r1_pro_sharpa_relative_eef_human": 26,
24
+ "real_r1_pro_sharpa_relative_eef_human_add_view": 26,
25
+ "real_r1_pro_sharpa_relative_eef_human_relative_hand": 26,
26
+ "real_r1_pro_sharpa_relative_eef_egodex": 26,
27
+ "real_r1_pro_sharpa_relative_eef_egodex_relative_hand": 26,
28
+ "real_r1_pro_sharpa_relative_eef_egodex_wrist_only": 26,
29
+ "real_r1_pro_sharpa_relative_eef_maxinsights": 26,
30
+ "real_r1_pro_sharpa_relative_eef_maxinsights_relative_hand": 26,
31
+ "real_r1_pro_sharpa_relative_eef_mecka": 26,
32
+ "real_r1_pro_sharpa_relative_eef_mecka_relative_hand": 26,
33
+ "real_g1_relative_eef_absolute_joints": 25,
34
+ "real_g1_relative_eef_absolute_joints_wrist_cam": 25,
35
+ "real_g1_relative_eef_relative_joints": 25,
36
+ "real_r1_pro_sharpa_relative_eef_relative_hand_relative_joint": 26,
37
+ "real_r1_pro_sharpa_relative_joint": 29,
38
+ "oxe_droid_relative_eef_relative_joint": 24,
39
+ "oxe_droid_relative_eef_relative_joint_swapped": 24,
40
+ "oxe_droid_relative_eef_relative_joint_upweight_z": 24,
41
+ "oxe_droid_relative_eef_relative_joint_upweight_z_swapped": 24,
42
+ "oxe_droid_relative_eef_relative_joint_3view": 24,
43
+ "oxe_droid_relative_eef_relative_joint_3view_swapped": 24,
44
+ "oxe_droid_relative_eef": 24,
45
+ "oxe_droid_joint_position_relative": 24,
46
+ "xdof_relative_eef_relative_joint": 27,
47
+ "xdof_relative_eef_relative_joint_subtask": 27,
48
+ "xdof_relative_eef": 27,
49
+ "xdof_relative_joint": 28,
50
+ "simpler_env_google": 0,
51
+ "simpler_env_widowx": 1,
52
+ "libero_sim": 2,
53
+ "droid_sim": 3,
54
+ "new_embodiment": 10
55
+ }
experiment_cfg/conf.yaml ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ load_config_path: null
2
+ model:
3
+ model_type: Gr00tN1d7
4
+ model_dtype: bfloat16
5
+ model_name: nvidia/Cosmos-Reason2-2B
6
+ backbone_model_type: qwen
7
+ model_revision: null
8
+ tune_top_llm_layers: 0
9
+ tune_top_visual_layers: 0
10
+ backbone_embedding_dim: 2048
11
+ tune_llm: false
12
+ tune_visual: false
13
+ select_layer: 12
14
+ reproject_vision: false
15
+ use_flash_attention: true
16
+ load_bf16: false
17
+ backbone_trainable_params_fp32: true
18
+ image_crop_size:
19
+ - 230
20
+ - 230
21
+ image_target_size:
22
+ - 256
23
+ - 256
24
+ shortest_image_edge: null
25
+ crop_fraction: null
26
+ random_rotation_angle: null
27
+ color_jitter_params: null
28
+ use_albumentations_transforms: true
29
+ extra_augmentation_config: null
30
+ formalize_language: true
31
+ apply_sincos_state_encoding: false
32
+ use_percentiles: true
33
+ use_relative_action: true
34
+ max_state_dim: 132
35
+ max_action_dim: 132
36
+ action_horizon: 40
37
+ hidden_size: 1024
38
+ input_embedding_dim: 1536
39
+ state_history_length: 1
40
+ add_pos_embed: true
41
+ attn_dropout: 0.2
42
+ use_vlln: true
43
+ max_seq_len: 1024
44
+ use_alternate_vl_dit: true
45
+ attend_text_every_n_blocks: 2
46
+ diffusion_model_cfg:
47
+ positional_embeddings: null
48
+ num_layers: 16
49
+ num_attention_heads: 32
50
+ attention_head_dim: 48
51
+ norm_type: ada_norm
52
+ dropout: 0.2
53
+ final_dropout: true
54
+ output_dim: 1024
55
+ interleave_self_attention: true
56
+ num_inference_timesteps: 4
57
+ noise_beta_alpha: 1.5
58
+ noise_beta_beta: 1.0
59
+ noise_s: 0.999
60
+ num_timestep_buckets: 1000
61
+ tune_projector: true
62
+ tune_diffusion_model: true
63
+ tune_vlln: true
64
+ state_dropout_prob: 0.2
65
+ exclude_state: false
66
+ use_mean_std: false
67
+ max_num_embodiments: 32
68
+ data:
69
+ datasets:
70
+ - dataset_paths:
71
+ - /data/datasets/Task_0013_clean_cafe_table_lerobot_per_subtask
72
+ embodiment_tag: new_embodiment
73
+ mix_ratio: 1.0
74
+ dataset_type: physical_embodiment
75
+ val_dataset_path: null
76
+ modality_configs:
77
+ new_embodiment:
78
+ video:
79
+ delta_indices:
80
+ - 0
81
+ modality_keys:
82
+ - cam_left_head
83
+ - cam_left_wrist
84
+ - cam_right_wrist
85
+ sin_cos_embedding_keys: null
86
+ mean_std_embedding_keys: null
87
+ action_configs: null
88
+ state:
89
+ delta_indices:
90
+ - 0
91
+ modality_keys:
92
+ - arm_left
93
+ - arm_right
94
+ - head
95
+ - lift
96
+ - odometry
97
+ sin_cos_embedding_keys: null
98
+ mean_std_embedding_keys: null
99
+ action_configs: null
100
+ action:
101
+ delta_indices:
102
+ - 0
103
+ - 1
104
+ - 2
105
+ - 3
106
+ - 4
107
+ - 5
108
+ - 6
109
+ - 7
110
+ - 8
111
+ - 9
112
+ - 10
113
+ - 11
114
+ - 12
115
+ - 13
116
+ - 14
117
+ - 15
118
+ modality_keys:
119
+ - arm_left
120
+ - arm_right
121
+ - head
122
+ - lift
123
+ - odometry
124
+ sin_cos_embedding_keys: null
125
+ mean_std_embedding_keys: null
126
+ action_configs:
127
+ - rep: ABSOLUTE
128
+ type: NON_EEF
129
+ format: DEFAULT
130
+ state_key: null
131
+ - rep: ABSOLUTE
132
+ type: NON_EEF
133
+ format: DEFAULT
134
+ state_key: null
135
+ - rep: ABSOLUTE
136
+ type: NON_EEF
137
+ format: DEFAULT
138
+ state_key: null
139
+ - rep: ABSOLUTE
140
+ type: NON_EEF
141
+ format: DEFAULT
142
+ state_key: null
143
+ - rep: ABSOLUTE
144
+ type: NON_EEF
145
+ format: DEFAULT
146
+ state_key: null
147
+ language:
148
+ delta_indices:
149
+ - 0
150
+ modality_keys:
151
+ - annotation.human.primitive_instruction
152
+ sin_cos_embedding_keys: null
153
+ mean_std_embedding_keys: null
154
+ action_configs: null
155
+ download_cache: false
156
+ shard_size: 1024
157
+ episode_sampling_rate: 0.1
158
+ num_shards_per_epoch: 100000
159
+ override_pretraining_statistics: true
160
+ mode: single_turn
161
+ random_chop: 0.0
162
+ mock_dataset_mode: false
163
+ shuffle: true
164
+ seed: 42
165
+ multiprocessing_context: fork
166
+ allow_padding: false
167
+ subsample_ratio: 1.0
168
+ image_crop_size:
169
+ - 244
170
+ - 244
171
+ image_target_size:
172
+ - 224
173
+ - 224
174
+ video_backend: torchcodec
175
+ training:
176
+ output_dir: /data/checkpoints/groot_stage2_persubtask_percam_1k_0515_0303_0515_0358
177
+ experiment_name: groot_stage2_persubtask_percam_1k_0515_0303_0515_0358
178
+ max_steps: 200000
179
+ global_batch_size: 4
180
+ batch_size: null
181
+ gradient_accumulation_steps: 1
182
+ learning_rate: 0.0001
183
+ lr_scheduler_type: cosine
184
+ weight_decay: 1.0e-05
185
+ warmup_ratio: 0.05
186
+ warmup_steps: 0
187
+ max_grad_norm: 1.0
188
+ optim: adamw_torch
189
+ start_from_checkpoint: /data/base_model
190
+ skip_weight_loading: false
191
+ tf32: true
192
+ fp16: false
193
+ bf16: true
194
+ eval_bf16: true
195
+ logging_steps: 10
196
+ save_steps: 50000
197
+ save_total_limit: 99
198
+ save_vl_model: false
199
+ save_only_model: false
200
+ keep_only_at_steps: 100000,150000,200000
201
+ upload_checkpoints: false
202
+ upload_every: 1000
203
+ upload_last_n_checkpoints: 5
204
+ max_concurrent_uploads: 2
205
+ eval_strategy: 'no'
206
+ eval_steps: 500
207
+ eval_set_split_ratio: 0.1
208
+ eval_batch_size: 2
209
+ save_best_eval_metric_name: ''
210
+ save_best_eval_metric_greater_is_better: true
211
+ deepspeed_stage: 2
212
+ gradient_checkpointing: false
213
+ transformers_trust_remote_code: true
214
+ transformers_local_files_only: false
215
+ transformers_cache_dir: null
216
+ transformers_access_token: null
217
+ use_ddp: false
218
+ ddp_bucket_cap_mb: 100
219
+ num_gpus: 1
220
+ dataloader_num_workers: 4
221
+ remove_unused_columns: false
222
+ use_wandb: false
223
+ wandb_project: finetune-gr00t-n1d7
224
+ enable_profiling: false
225
+ max_retries: 3
226
+ assert_loss_less_than: null
227
+ add_rl_callback: false
228
+ enable_open_loop_eval: false
229
+ open_loop_eval_traj_ids:
230
+ - 0
231
+ open_loop_eval_steps_per_traj: 100
232
+ open_loop_eval_plot_indices: null
233
+ max_steps: 200000
234
+ save_steps: 50000
experiment_cfg/config.yaml ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !!python/object:gr00t.configs.base_config.Config
2
+ data: !!python/object:gr00t.configs.data.data_config.DataConfig
3
+ allow_padding: false
4
+ datasets:
5
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
6
+ dataset_paths:
7
+ - /data/datasets/Task_0013_clean_cafe_table_lerobot_per_subtask
8
+ dataset_type: physical_embodiment
9
+ embodiment_tag: new_embodiment
10
+ mix_ratio: 1.0
11
+ val_dataset_path: null
12
+ download_cache: false
13
+ episode_sampling_rate: 0.1
14
+ image_crop_size:
15
+ - 244
16
+ - 244
17
+ image_target_size:
18
+ - 224
19
+ - 224
20
+ mock_dataset_mode: false
21
+ modality_configs:
22
+ new_embodiment:
23
+ action: !!python/object:gr00t.data.types.ModalityConfig
24
+ action_configs:
25
+ - !!python/object:gr00t.data.types.ActionConfig
26
+ format: &id001 !!python/object/apply:gr00t.data.types.ActionFormat
27
+ - default
28
+ rep: &id002 !!python/object/apply:gr00t.data.types.ActionRepresentation
29
+ - absolute
30
+ state_key: null
31
+ type: &id003 !!python/object/apply:gr00t.data.types.ActionType
32
+ - non_eef
33
+ - !!python/object:gr00t.data.types.ActionConfig
34
+ format: *id001
35
+ rep: *id002
36
+ state_key: null
37
+ type: *id003
38
+ - !!python/object:gr00t.data.types.ActionConfig
39
+ format: *id001
40
+ rep: *id002
41
+ state_key: null
42
+ type: *id003
43
+ - !!python/object:gr00t.data.types.ActionConfig
44
+ format: *id001
45
+ rep: *id002
46
+ state_key: null
47
+ type: *id003
48
+ - !!python/object:gr00t.data.types.ActionConfig
49
+ format: *id001
50
+ rep: *id002
51
+ state_key: null
52
+ type: *id003
53
+ delta_indices:
54
+ - 0
55
+ - 1
56
+ - 2
57
+ - 3
58
+ - 4
59
+ - 5
60
+ - 6
61
+ - 7
62
+ - 8
63
+ - 9
64
+ - 10
65
+ - 11
66
+ - 12
67
+ - 13
68
+ - 14
69
+ - 15
70
+ mean_std_embedding_keys: null
71
+ modality_keys:
72
+ - arm_left
73
+ - arm_right
74
+ - head
75
+ - lift
76
+ - odometry
77
+ sin_cos_embedding_keys: null
78
+ language: !!python/object:gr00t.data.types.ModalityConfig
79
+ action_configs: null
80
+ delta_indices:
81
+ - 0
82
+ mean_std_embedding_keys: null
83
+ modality_keys:
84
+ - annotation.human.primitive_instruction
85
+ sin_cos_embedding_keys: null
86
+ state: !!python/object:gr00t.data.types.ModalityConfig
87
+ action_configs: null
88
+ delta_indices:
89
+ - 0
90
+ mean_std_embedding_keys: null
91
+ modality_keys:
92
+ - arm_left
93
+ - arm_right
94
+ - head
95
+ - lift
96
+ - odometry
97
+ sin_cos_embedding_keys: null
98
+ video: !!python/object:gr00t.data.types.ModalityConfig
99
+ action_configs: null
100
+ delta_indices:
101
+ - 0
102
+ mean_std_embedding_keys: null
103
+ modality_keys:
104
+ - cam_left_head
105
+ - cam_left_wrist
106
+ - cam_right_wrist
107
+ sin_cos_embedding_keys: null
108
+ mode: single_turn
109
+ multiprocessing_context: fork
110
+ num_shards_per_epoch: 100000
111
+ override_pretraining_statistics: true
112
+ random_chop: 0.0
113
+ seed: 42
114
+ shard_size: 1024
115
+ shuffle: true
116
+ subsample_ratio: 1.0
117
+ video_backend: torchcodec
118
+ load_config_path: null
119
+ model: !!python/object:gr00t.configs.model.gr00t_n1d7.Gr00tN1d7Config
120
+ _attn_implementation_internal: null
121
+ _commit_hash: null
122
+ _name_or_path: ''
123
+ _output_attentions: false
124
+ add_cross_attention: false
125
+ architectures: null
126
+ backbone_trainable_params_fp32: true
127
+ bad_words_ids: null
128
+ begin_suppress_tokens: null
129
+ bos_token_id: null
130
+ chunk_size_feed_forward: 0
131
+ color_jitter_params: null
132
+ cross_attention_hidden_size: null
133
+ decoder_start_token_id: null
134
+ diffusion_model_cfg:
135
+ attention_head_dim: 48
136
+ dropout: 0.2
137
+ final_dropout: true
138
+ interleave_self_attention: true
139
+ norm_type: ada_norm
140
+ num_attention_heads: 32
141
+ num_layers: 16
142
+ output_dim: 1024
143
+ positional_embeddings: null
144
+ diversity_penalty: 0.0
145
+ do_sample: false
146
+ dtype: null
147
+ early_stopping: false
148
+ encoder_no_repeat_ngram_size: 0
149
+ eos_token_id: null
150
+ exponential_decay_length_penalty: null
151
+ extra_augmentation_config: null
152
+ finetuning_task: null
153
+ forced_bos_token_id: null
154
+ forced_eos_token_id: null
155
+ id2label:
156
+ 0: LABEL_0
157
+ 1: LABEL_1
158
+ is_decoder: false
159
+ is_encoder_decoder: false
160
+ label2id:
161
+ LABEL_0: 0
162
+ LABEL_1: 1
163
+ length_penalty: 1.0
164
+ load_bf16: false
165
+ max_length: 20
166
+ min_length: 0
167
+ model_name: nvidia/Cosmos-Reason2-2B
168
+ no_repeat_ngram_size: 0
169
+ num_beam_groups: 1
170
+ num_beams: 1
171
+ num_return_sequences: 1
172
+ output_hidden_states: false
173
+ output_scores: false
174
+ pad_token_id: null
175
+ prefix: null
176
+ problem_type: null
177
+ pruned_heads: {}
178
+ random_rotation_angle: null
179
+ remove_invalid_values: false
180
+ repetition_penalty: 1.0
181
+ reproject_vision: false
182
+ return_dict: true
183
+ return_dict_in_generate: false
184
+ sep_token_id: null
185
+ state_dropout_prob: 0.2
186
+ suppress_tokens: null
187
+ task_specific_params: null
188
+ temperature: 1.0
189
+ tf_legacy_loss: false
190
+ tie_encoder_decoder: false
191
+ tie_word_embeddings: true
192
+ tokenizer_class: null
193
+ top_k: 50
194
+ top_p: 1.0
195
+ torchscript: false
196
+ transformers_version: null
197
+ tune_diffusion_model: true
198
+ tune_llm: false
199
+ tune_projector: true
200
+ tune_top_llm_layers: 0
201
+ tune_top_visual_layers: 0
202
+ tune_visual: false
203
+ typical_p: 1.0
204
+ use_bfloat16: false
205
+ use_relative_action: true
206
+ training: !!python/object:gr00t.configs.training.training_config.TrainingConfig
207
+ add_rl_callback: false
208
+ assert_loss_less_than: null
209
+ batch_size: null
210
+ bf16: true
211
+ dataloader_num_workers: 4
212
+ ddp_bucket_cap_mb: 100
213
+ deepspeed_stage: 2
214
+ enable_open_loop_eval: false
215
+ enable_profiling: false
216
+ eval_batch_size: 2
217
+ eval_bf16: true
218
+ eval_set_split_ratio: 0.1
219
+ eval_steps: 500
220
+ eval_strategy: 'no'
221
+ experiment_name: groot_stage2_persubtask_percam_1k_0515_0303_0515_0358
222
+ fp16: false
223
+ global_batch_size: 4
224
+ gradient_accumulation_steps: 1
225
+ gradient_checkpointing: false
226
+ keep_only_at_steps: 100000,150000,200000
227
+ learning_rate: 0.0001
228
+ logging_steps: 10
229
+ lr_scheduler_type: cosine
230
+ max_concurrent_uploads: 2
231
+ max_grad_norm: 1.0
232
+ max_retries: 3
233
+ max_steps: 200000
234
+ num_gpus: 1
235
+ open_loop_eval_plot_indices: null
236
+ open_loop_eval_steps_per_traj: 100
237
+ open_loop_eval_traj_ids:
238
+ - 0
239
+ optim: adamw_torch
240
+ output_dir: /data/checkpoints/groot_stage2_persubtask_percam_1k_0515_0303_0515_0358
241
+ remove_unused_columns: false
242
+ save_best_eval_metric_greater_is_better: true
243
+ save_best_eval_metric_name: ''
244
+ save_only_model: false
245
+ save_steps: 50000
246
+ save_total_limit: 99
247
+ save_vl_model: false
248
+ skip_weight_loading: false
249
+ start_from_checkpoint: /data/base_model
250
+ tf32: true
251
+ transformers_access_token: null
252
+ transformers_cache_dir: null
253
+ transformers_local_files_only: false
254
+ transformers_trust_remote_code: true
255
+ upload_checkpoints: false
256
+ upload_every: 1000
257
+ upload_last_n_checkpoints: 5
258
+ use_ddp: false
259
+ use_wandb: false
260
+ wandb_project: finetune-gr00t-n1d7
261
+ warmup_ratio: 0.05
262
+ warmup_steps: 0
263
+ weight_decay: 1.0e-05
experiment_cfg/dataset_statistics.json ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "new_embodiment": {
3
+ "state": {
4
+ "arm_left": {
5
+ "min": [
6
+ -0.4281843602657318,
7
+ -2.3968450477696024e-05,
8
+ -1.1873011589050293,
9
+ -2.303715467453003,
10
+ -1.649592638015747,
11
+ -0.6620565056800842,
12
+ -1.5422195196151733,
13
+ 0.0015339808305725455
14
+ ],
15
+ "max": [
16
+ 1.0800782442092896,
17
+ 0.9176201224327087,
18
+ 0.2563665509223938,
19
+ -0.5655835270881653,
20
+ 0.6887214183807373,
21
+ 0.9628725647926331,
22
+ 1.3565888404846191,
23
+ 1.0246992111206055
24
+ ],
25
+ "mean": [
26
+ -0.05774058401584625,
27
+ 0.5573647022247314,
28
+ -0.19626182317733765,
29
+ -1.458626627922058,
30
+ 0.27720722556114197,
31
+ 0.17879338562488556,
32
+ -0.9416439533233643,
33
+ 0.2882844805717468
34
+ ],
35
+ "std": [
36
+ 0.44978252053260803,
37
+ 0.30842334032058716,
38
+ 0.17603261768817902,
39
+ 0.27855199575424194,
40
+ 0.37743106484413147,
41
+ 0.21612906455993652,
42
+ 0.7173371911048889,
43
+ 0.37325963377952576
44
+ ],
45
+ "q01": [
46
+ -0.4279686510562897,
47
+ 0.04364463046193123,
48
+ -0.8802974045276641,
49
+ -2.1261229610443113,
50
+ -1.306272646188736,
51
+ -0.34463407009840014,
52
+ -1.5416609048843384,
53
+ 0.06596117466688156
54
+ ],
55
+ "q99": [
56
+ 0.9178541719913481,
57
+ 0.8283616304397583,
58
+ 0.1767838601768014,
59
+ -0.6537470191717161,
60
+ 0.5714724206924424,
61
+ 0.7836709952354418,
62
+ 1.1422420465946197,
63
+ 1.0246992111206055
64
+ ]
65
+ },
66
+ "arm_right": {
67
+ "min": [
68
+ -0.40353283286094666,
69
+ -0.8960365056991577,
70
+ -0.2532506287097931,
71
+ -2.4420974254608154,
72
+ -0.713325023651123,
73
+ -0.38107436895370483,
74
+ -0.19898304343223572,
75
+ 0.02147573046386242
76
+ ],
77
+ "max": [
78
+ 0.843557596206665,
79
+ -0.11351457983255386,
80
+ 1.1473217010498047,
81
+ -1.1692049503326416,
82
+ 0.8832493424415588,
83
+ 0.7743846774101257,
84
+ 1.363911509513855,
85
+ 0.5430291891098022
86
+ ],
87
+ "mean": [
88
+ 0.08525058627128601,
89
+ -0.6664908528327942,
90
+ 0.24424459040164948,
91
+ -1.665743350982666,
92
+ -0.269589364528656,
93
+ -0.057081472128629684,
94
+ 1.2473891973495483,
95
+ 0.10683941841125488
96
+ ],
97
+ "std": [
98
+ 0.5257307887077332,
99
+ 0.24324843287467957,
100
+ 0.2773193120956421,
101
+ 0.3064912259578705,
102
+ 0.48827773332595825,
103
+ 0.18531718850135803,
104
+ 0.27892857789993286,
105
+ 0.08736545592546463
106
+ ],
107
+ "q01": [
108
+ -0.4034489393234253,
109
+ -0.8958447575569153,
110
+ -0.11350259184837341,
111
+ -2.312161021232605,
112
+ -0.7133130431175232,
113
+ -0.26229873299598694,
114
+ 0.25463706254959106,
115
+ 0.05062136426568031
116
+ ],
117
+ "q99": [
118
+ 0.7746483087539673,
119
+ -0.1707821533083921,
120
+ 0.885482121706008,
121
+ -1.3928306102752686,
122
+ 0.8034364682435989,
123
+ 0.5922776329517357,
124
+ 1.3637149333953857,
125
+ 0.4770680367946625
126
+ ]
127
+ },
128
+ "head": {
129
+ "min": [
130
+ 0.6504078507423401,
131
+ -0.34514567255973816
132
+ ],
133
+ "max": [
134
+ 0.9019806981086731,
135
+ 0.35281556844711304
136
+ ],
137
+ "mean": [
138
+ 0.7139426469802856,
139
+ 0.07013361155986786
140
+ ],
141
+ "std": [
142
+ 0.06295817345380739,
143
+ 0.1643301099538803
144
+ ],
145
+ "q01": [
146
+ 0.685689389705658,
147
+ -0.33900976181030273
148
+ ],
149
+ "q99": [
150
+ 0.9004467129707336,
151
+ 0.3466796576976776
152
+ ]
153
+ },
154
+ "lift": {
155
+ "min": [
156
+ -0.17427000403404236
157
+ ],
158
+ "max": [
159
+ -0.15741999447345734
160
+ ],
161
+ "mean": [
162
+ -0.16542737185955048
163
+ ],
164
+ "std": [
165
+ 0.003544580889865932
166
+ ],
167
+ "q01": [
168
+ -0.17379169523715973
169
+ ],
170
+ "q99": [
171
+ -0.15801000595092773
172
+ ]
173
+ },
174
+ "odometry": {
175
+ "min": [
176
+ -0.027980072423815727,
177
+ -0.005805037450045347,
178
+ -0.5454775094985962
179
+ ],
180
+ "max": [
181
+ 0.36404621601104736,
182
+ 0.008551697246730328,
183
+ 0.25096508860588074
184
+ ],
185
+ "mean": [
186
+ 0.02495281957089901,
187
+ 3.1950741686159745e-05,
188
+ -0.03993094712495804
189
+ ],
190
+ "std": [
191
+ 0.08207690715789795,
192
+ 0.000459484028397128,
193
+ 0.1336042881011963
194
+ ],
195
+ "q01": [
196
+ -0.0016634842148050663,
197
+ -0.00097130918642506,
198
+ -0.5038510084152221
199
+ ],
200
+ "q99": [
201
+ 0.31480172276496887,
202
+ 0.0017434144788421512,
203
+ 0.010901974430307722
204
+ ]
205
+ }
206
+ },
207
+ "action": {
208
+ "arm_left": {
209
+ "min": [
210
+ -0.4279686510562897,
211
+ 0.0,
212
+ -1.1873011589050293,
213
+ -2.3055732250213623,
214
+ -1.650563359260559,
215
+ -0.6642136573791504,
216
+ -1.541650652885437,
217
+ 0.0
218
+ ],
219
+ "max": [
220
+ 1.0814564228057861,
221
+ 0.9203884601593018,
222
+ 0.25770875811576843,
223
+ -0.5645049214363098,
224
+ 0.6887573599815369,
225
+ 0.9633399248123169,
226
+ 1.356039047241211,
227
+ 1.100000023841858
228
+ ],
229
+ "mean": [
230
+ -0.05774739384651184,
231
+ 0.5573772192001343,
232
+ -0.19626739621162415,
233
+ -1.4586329460144043,
234
+ 0.27720752358436584,
235
+ 0.17879951000213623,
236
+ -0.9416630268096924,
237
+ 0.3031612038612366
238
+ ],
239
+ "std": [
240
+ 0.4501131772994995,
241
+ 0.3084859549999237,
242
+ 0.17632049322128296,
243
+ 0.27914631366729736,
244
+ 0.3778475821018219,
245
+ 0.21659067273139954,
246
+ 0.717369556427002,
247
+ 0.40666264295578003
248
+ ],
249
+ "q01": [
250
+ -0.4279686510562897,
251
+ 0.04295146092772484,
252
+ -0.8835729360580444,
253
+ -2.1285341143608094,
254
+ -1.310019612312317,
255
+ -0.34821364283561707,
256
+ -1.541650652885437,
257
+ 0.0638110738992691
258
+ ],
259
+ "q99": [
260
+ 0.9190197205543501,
261
+ 0.8283376693725586,
262
+ 0.17794176936149597,
263
+ -0.653475821018219,
264
+ 0.5721748471260071,
265
+ 0.7838642001152039,
266
+ 1.1412817239761353,
267
+ 1.100000023841858
268
+ ]
269
+ },
270
+ "arm_right": {
271
+ "min": [
272
+ -0.4038594961166382,
273
+ -0.8958327770233154,
274
+ -0.25731149315834045,
275
+ -2.4451653957366943,
276
+ -0.7133010625839233,
277
+ -0.3876160681247711,
278
+ -0.2008066177368164,
279
+ 0.02238057181239128
280
+ ],
281
+ "max": [
282
+ 0.8457589149475098,
283
+ -0.11351457983255386,
284
+ 1.147417664527893,
285
+ -1.1673593521118164,
286
+ 0.8835729360580444,
287
+ 0.7746602892875671,
288
+ 1.3637045621871948,
289
+ 0.5488427877426147
290
+ ],
291
+ "mean": [
292
+ 0.08525016158819199,
293
+ -0.6664919853210449,
294
+ 0.2442476898431778,
295
+ -1.6657389402389526,
296
+ -0.26958319544792175,
297
+ -0.05708172917366028,
298
+ 1.2473909854888916,
299
+ 0.10692945867776871
300
+ ],
301
+ "std": [
302
+ 0.5259350538253784,
303
+ 0.24333280324935913,
304
+ 0.2775174379348755,
305
+ 0.30672529339790344,
306
+ 0.4884907603263855,
307
+ 0.1854144185781479,
308
+ 0.2789359986782074,
309
+ 0.0878915935754776
310
+ ],
311
+ "q01": [
312
+ -0.40343695878982544,
313
+ -0.8958327770233154,
314
+ -0.11350259184837341,
315
+ -2.3132429122924805,
316
+ -0.7133010625839233,
317
+ -0.26229873299598694,
318
+ 0.2546408176422119,
319
+ 0.04937863349914551
320
+ ],
321
+ "q99": [
322
+ 0.7746483087539673,
323
+ -0.1702718734741211,
324
+ 0.8853676986694309,
325
+ -1.3928545713424683,
326
+ 0.803805947303772,
327
+ 0.5938036859035476,
328
+ 1.3637045621871948,
329
+ 0.4779728651046753
330
+ ]
331
+ },
332
+ "head": {
333
+ "min": [
334
+ 0.6501029133796692,
335
+ -0.3499999940395355
336
+ ],
337
+ "max": [
338
+ 0.6951000094413757,
339
+ 0.3499999940395355
340
+ ],
341
+ "mean": [
342
+ 0.695157527923584,
343
+ 0.07115019112825394
344
+ ],
345
+ "std": [
346
+ 0.00155158154666312,
347
+ 0.16559593379497528
348
+ ],
349
+ "q01": [
350
+ 0.6951000094413757,
351
+ -0.3499999940395355
352
+ ],
353
+ "q99": [
354
+ 0.6951000094413757,
355
+ 0.3499999940395355
356
+ ]
357
+ },
358
+ "lift": {
359
+ "min": [
360
+ -0.17427000403404236
361
+ ],
362
+ "max": [
363
+ -0.15741999447345734
364
+ ],
365
+ "mean": [
366
+ -0.16543187201023102
367
+ ],
368
+ "std": [
369
+ 0.0035445999819783444
370
+ ],
371
+ "q01": [
372
+ -0.17379169523715973
373
+ ],
374
+ "q99": [
375
+ -0.15801000595092773
376
+ ]
377
+ },
378
+ "odometry": {
379
+ "min": [
380
+ 0.0,
381
+ 0.0,
382
+ -0.4951171875
383
+ ],
384
+ "max": [
385
+ 0.3203125,
386
+ 0.0,
387
+ 0.5
388
+ ],
389
+ "mean": [
390
+ 0.026779811829328537,
391
+ 0.0,
392
+ -0.0417805090546608
393
+ ],
394
+ "std": [
395
+ 0.08672492951154709,
396
+ 0.0,
397
+ 0.1375780999660492
398
+ ],
399
+ "q01": [
400
+ 0.0,
401
+ 0.0,
402
+ -0.4951171875
403
+ ],
404
+ "q99": [
405
+ 0.3138020932674408,
406
+ 0.0,
407
+ 0.0
408
+ ]
409
+ }
410
+ },
411
+ "relative_action": {}
412
+ }
413
+ }
experiment_cfg/final_model_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "Gr00tN1d7",
3
+ "model_dtype": "bfloat16",
4
+ "model_name": "nvidia/Cosmos-Reason2-2B",
5
+ "backbone_model_type": "qwen",
6
+ "model_revision": null,
7
+ "tune_top_llm_layers": 0,
8
+ "tune_top_visual_layers": 0,
9
+ "backbone_embedding_dim": 2048,
10
+ "tune_llm": false,
11
+ "tune_visual": false,
12
+ "select_layer": 16,
13
+ "reproject_vision": false,
14
+ "use_flash_attention": true,
15
+ "load_bf16": false,
16
+ "backbone_trainable_params_fp32": true,
17
+ "extra_augmentation_config": null,
18
+ "apply_sincos_state_encoding": false,
19
+ "use_percentiles": true,
20
+ "use_relative_action": false,
21
+ "max_state_dim": 132,
22
+ "max_action_dim": 132,
23
+ "action_horizon": 40,
24
+ "hidden_size": 1024,
25
+ "input_embedding_dim": 1536,
26
+ "state_history_length": 1,
27
+ "add_pos_embed": true,
28
+ "attn_dropout": 0.2,
29
+ "use_vlln": true,
30
+ "max_seq_len": 1024,
31
+ "use_alternate_vl_dit": true,
32
+ "attend_text_every_n_blocks": 2,
33
+ "diffusion_model_cfg": {
34
+ "attention_head_dim": 48,
35
+ "dropout": 0.2,
36
+ "final_dropout": true,
37
+ "interleave_self_attention": true,
38
+ "norm_type": "ada_norm",
39
+ "num_attention_heads": 32,
40
+ "num_layers": 32,
41
+ "output_dim": 1024,
42
+ "positional_embeddings": null
43
+ },
44
+ "num_inference_timesteps": 4,
45
+ "noise_beta_alpha": 1.5,
46
+ "noise_beta_beta": 1.0,
47
+ "noise_s": 0.999,
48
+ "num_timestep_buckets": 1000,
49
+ "tune_projector": true,
50
+ "tune_diffusion_model": true,
51
+ "tune_vlln": true,
52
+ "state_dropout_prob": 0.2,
53
+ "exclude_state": false,
54
+ "use_mean_std": false,
55
+ "max_num_embodiments": 32
56
+ }
experiment_cfg/final_processor_config.json ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fa792393bc1ac93582e9cb059686f28f9e564babe2462c73548c0f01fd6092a
3
+ size 4986649584
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:201d318145f6984359985d5ca0b020ed46e7d9c218a3169287154f3a4df2e6ed
3
+ size 4970792616
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0ffa16b3126a447f0dcedcd6f0a271d28f3e730ff76264b7e0ee09f774736aa
3
+ size 2618758696
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
processor_config.json ADDED
@@ -0,0 +1,1148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "Gr00tN1d7Processor",
3
+ "processor_kwargs": {
4
+ "modality_configs": {
5
+ "real_g1_relative_eef_relative_joints": {
6
+ "video": {
7
+ "delta_indices": [
8
+ -20,
9
+ 0
10
+ ],
11
+ "modality_keys": [
12
+ "ego_view"
13
+ ],
14
+ "sin_cos_embedding_keys": null,
15
+ "mean_std_embedding_keys": null,
16
+ "action_configs": null
17
+ },
18
+ "state": {
19
+ "delta_indices": [
20
+ 0
21
+ ],
22
+ "modality_keys": [
23
+ "left_wrist_eef_9d",
24
+ "right_wrist_eef_9d",
25
+ "left_hand",
26
+ "right_hand",
27
+ "left_arm",
28
+ "right_arm",
29
+ "waist"
30
+ ],
31
+ "sin_cos_embedding_keys": null,
32
+ "mean_std_embedding_keys": null,
33
+ "action_configs": null
34
+ },
35
+ "action": {
36
+ "delta_indices": [
37
+ 0,
38
+ 1,
39
+ 2,
40
+ 3,
41
+ 4,
42
+ 5,
43
+ 6,
44
+ 7,
45
+ 8,
46
+ 9,
47
+ 10,
48
+ 11,
49
+ 12,
50
+ 13,
51
+ 14,
52
+ 15,
53
+ 16,
54
+ 17,
55
+ 18,
56
+ 19,
57
+ 20,
58
+ 21,
59
+ 22,
60
+ 23,
61
+ 24,
62
+ 25,
63
+ 26,
64
+ 27,
65
+ 28,
66
+ 29,
67
+ 30,
68
+ 31,
69
+ 32,
70
+ 33,
71
+ 34,
72
+ 35,
73
+ 36,
74
+ 37,
75
+ 38,
76
+ 39
77
+ ],
78
+ "modality_keys": [
79
+ "left_wrist_eef_9d",
80
+ "right_wrist_eef_9d",
81
+ "left_hand",
82
+ "right_hand",
83
+ "left_arm",
84
+ "right_arm",
85
+ "waist",
86
+ "base_height_command",
87
+ "navigate_command"
88
+ ],
89
+ "sin_cos_embedding_keys": null,
90
+ "mean_std_embedding_keys": null,
91
+ "action_configs": [
92
+ {
93
+ "rep": "RELATIVE",
94
+ "type": "EEF",
95
+ "format": "XYZ_ROT6D",
96
+ "state_key": "left_wrist_eef_9d"
97
+ },
98
+ {
99
+ "rep": "RELATIVE",
100
+ "type": "EEF",
101
+ "format": "XYZ_ROT6D",
102
+ "state_key": "right_wrist_eef_9d"
103
+ },
104
+ {
105
+ "rep": "ABSOLUTE",
106
+ "type": "NON_EEF",
107
+ "format": "DEFAULT",
108
+ "state_key": "left_hand"
109
+ },
110
+ {
111
+ "rep": "ABSOLUTE",
112
+ "type": "NON_EEF",
113
+ "format": "DEFAULT",
114
+ "state_key": "right_hand"
115
+ },
116
+ {
117
+ "rep": "RELATIVE",
118
+ "type": "NON_EEF",
119
+ "format": "DEFAULT",
120
+ "state_key": "left_arm"
121
+ },
122
+ {
123
+ "rep": "RELATIVE",
124
+ "type": "NON_EEF",
125
+ "format": "DEFAULT",
126
+ "state_key": "right_arm"
127
+ },
128
+ {
129
+ "rep": "ABSOLUTE",
130
+ "type": "NON_EEF",
131
+ "format": "DEFAULT",
132
+ "state_key": "waist"
133
+ },
134
+ {
135
+ "rep": "ABSOLUTE",
136
+ "type": "NON_EEF",
137
+ "format": "DEFAULT",
138
+ "state_key": "base_height_command"
139
+ },
140
+ {
141
+ "rep": "ABSOLUTE",
142
+ "type": "NON_EEF",
143
+ "format": "DEFAULT",
144
+ "state_key": "navigate_command"
145
+ }
146
+ ]
147
+ },
148
+ "language": {
149
+ "delta_indices": [
150
+ 0
151
+ ],
152
+ "modality_keys": [
153
+ "annotation.human.task_description"
154
+ ],
155
+ "sin_cos_embedding_keys": null,
156
+ "mean_std_embedding_keys": null,
157
+ "action_configs": null
158
+ }
159
+ },
160
+ "real_r1_pro_sharpa_relative_eef_mecka": {
161
+ "video": {
162
+ "delta_indices": [
163
+ -30,
164
+ 0
165
+ ],
166
+ "modality_keys": [
167
+ "ego_view_cropratio_res320x240_freq30"
168
+ ],
169
+ "sin_cos_embedding_keys": null,
170
+ "mean_std_embedding_keys": null,
171
+ "action_configs": null
172
+ },
173
+ "state": {
174
+ "delta_indices": [
175
+ 0
176
+ ],
177
+ "modality_keys": [
178
+ "left_wrist_eef",
179
+ "right_wrist_eef",
180
+ "left_hand_joints",
181
+ "right_hand_joints"
182
+ ],
183
+ "sin_cos_embedding_keys": null,
184
+ "mean_std_embedding_keys": null,
185
+ "action_configs": null
186
+ },
187
+ "action": {
188
+ "delta_indices": [
189
+ 0,
190
+ 1,
191
+ 2,
192
+ 3,
193
+ 4,
194
+ 5,
195
+ 6,
196
+ 7,
197
+ 8,
198
+ 9,
199
+ 10,
200
+ 11,
201
+ 12,
202
+ 13,
203
+ 14,
204
+ 15,
205
+ 16,
206
+ 17,
207
+ 18,
208
+ 19,
209
+ 20,
210
+ 21,
211
+ 22,
212
+ 23,
213
+ 24,
214
+ 25,
215
+ 26,
216
+ 27,
217
+ 28,
218
+ 29,
219
+ 30,
220
+ 31,
221
+ 32,
222
+ 33,
223
+ 34,
224
+ 35,
225
+ 36,
226
+ 37,
227
+ 38,
228
+ 39
229
+ ],
230
+ "modality_keys": [
231
+ "left_wrist_eef",
232
+ "right_wrist_eef",
233
+ "left_hand_joints",
234
+ "right_hand_joints"
235
+ ],
236
+ "sin_cos_embedding_keys": null,
237
+ "mean_std_embedding_keys": null,
238
+ "action_configs": [
239
+ {
240
+ "rep": "RELATIVE",
241
+ "type": "EEF",
242
+ "format": "XYZ_ROT6D",
243
+ "state_key": "left_wrist_eef"
244
+ },
245
+ {
246
+ "rep": "RELATIVE",
247
+ "type": "EEF",
248
+ "format": "XYZ_ROT6D",
249
+ "state_key": "right_wrist_eef"
250
+ },
251
+ {
252
+ "rep": "ABSOLUTE",
253
+ "type": "NON_EEF",
254
+ "format": "DEFAULT",
255
+ "state_key": "left_hand_joints"
256
+ },
257
+ {
258
+ "rep": "ABSOLUTE",
259
+ "type": "NON_EEF",
260
+ "format": "DEFAULT",
261
+ "state_key": "right_hand_joints"
262
+ }
263
+ ]
264
+ },
265
+ "language": {
266
+ "delta_indices": [
267
+ 0
268
+ ],
269
+ "modality_keys": [
270
+ "annotation.human.coarse_action"
271
+ ],
272
+ "sin_cos_embedding_keys": null,
273
+ "mean_std_embedding_keys": null,
274
+ "action_configs": null
275
+ }
276
+ },
277
+ "real_r1_pro_sharpa_relative_eef_human": {
278
+ "video": {
279
+ "delta_indices": [
280
+ -20,
281
+ 0
282
+ ],
283
+ "modality_keys": [
284
+ "ego_view_res320x240_freq20",
285
+ "left_wrist_view_res320x240_freq20",
286
+ "right_wrist_view_res320x240_freq20"
287
+ ],
288
+ "sin_cos_embedding_keys": null,
289
+ "mean_std_embedding_keys": null,
290
+ "action_configs": null
291
+ },
292
+ "state": {
293
+ "delta_indices": [
294
+ 0
295
+ ],
296
+ "modality_keys": [
297
+ "left_wrist_eef",
298
+ "right_wrist_eef",
299
+ "left_hand_joints",
300
+ "right_hand_joints"
301
+ ],
302
+ "sin_cos_embedding_keys": null,
303
+ "mean_std_embedding_keys": null,
304
+ "action_configs": null
305
+ },
306
+ "action": {
307
+ "delta_indices": [
308
+ 0,
309
+ 1,
310
+ 2,
311
+ 3,
312
+ 4,
313
+ 5,
314
+ 6,
315
+ 7,
316
+ 8,
317
+ 9,
318
+ 10,
319
+ 11,
320
+ 12,
321
+ 13,
322
+ 14,
323
+ 15,
324
+ 16,
325
+ 17,
326
+ 18,
327
+ 19,
328
+ 20,
329
+ 21,
330
+ 22,
331
+ 23,
332
+ 24,
333
+ 25,
334
+ 26,
335
+ 27,
336
+ 28,
337
+ 29,
338
+ 30,
339
+ 31,
340
+ 32,
341
+ 33,
342
+ 34,
343
+ 35,
344
+ 36,
345
+ 37,
346
+ 38,
347
+ 39
348
+ ],
349
+ "modality_keys": [
350
+ "left_wrist_eef",
351
+ "right_wrist_eef",
352
+ "left_hand_joints",
353
+ "right_hand_joints"
354
+ ],
355
+ "sin_cos_embedding_keys": null,
356
+ "mean_std_embedding_keys": null,
357
+ "action_configs": [
358
+ {
359
+ "rep": "RELATIVE",
360
+ "type": "EEF",
361
+ "format": "XYZ_ROT6D",
362
+ "state_key": "left_wrist_eef"
363
+ },
364
+ {
365
+ "rep": "RELATIVE",
366
+ "type": "EEF",
367
+ "format": "XYZ_ROT6D",
368
+ "state_key": "right_wrist_eef"
369
+ },
370
+ {
371
+ "rep": "ABSOLUTE",
372
+ "type": "NON_EEF",
373
+ "format": "DEFAULT",
374
+ "state_key": "left_hand_joints"
375
+ },
376
+ {
377
+ "rep": "ABSOLUTE",
378
+ "type": "NON_EEF",
379
+ "format": "DEFAULT",
380
+ "state_key": "right_hand_joints"
381
+ }
382
+ ]
383
+ },
384
+ "language": {
385
+ "delta_indices": [
386
+ 0
387
+ ],
388
+ "modality_keys": [
389
+ "annotation.human.coarse_action"
390
+ ],
391
+ "sin_cos_embedding_keys": null,
392
+ "mean_std_embedding_keys": null,
393
+ "action_configs": null
394
+ }
395
+ },
396
+ "real_r1_pro_sharpa_relative_eef": {
397
+ "video": {
398
+ "delta_indices": [
399
+ -20,
400
+ 0
401
+ ],
402
+ "modality_keys": [
403
+ "ego_view_res320x240_freq20",
404
+ "left_wrist_view_res320x240_freq20",
405
+ "right_wrist_view_res320x240_freq20"
406
+ ],
407
+ "sin_cos_embedding_keys": null,
408
+ "mean_std_embedding_keys": null,
409
+ "action_configs": null
410
+ },
411
+ "state": {
412
+ "delta_indices": [
413
+ 0
414
+ ],
415
+ "modality_keys": [
416
+ "left_wrist_eef",
417
+ "right_wrist_eef",
418
+ "left_hand_joints",
419
+ "right_hand_joints"
420
+ ],
421
+ "sin_cos_embedding_keys": null,
422
+ "mean_std_embedding_keys": null,
423
+ "action_configs": null
424
+ },
425
+ "action": {
426
+ "delta_indices": [
427
+ 0,
428
+ 1,
429
+ 2,
430
+ 3,
431
+ 4,
432
+ 5,
433
+ 6,
434
+ 7,
435
+ 8,
436
+ 9,
437
+ 10,
438
+ 11,
439
+ 12,
440
+ 13,
441
+ 14,
442
+ 15,
443
+ 16,
444
+ 17,
445
+ 18,
446
+ 19,
447
+ 20,
448
+ 21,
449
+ 22,
450
+ 23,
451
+ 24,
452
+ 25,
453
+ 26,
454
+ 27,
455
+ 28,
456
+ 29,
457
+ 30,
458
+ 31,
459
+ 32,
460
+ 33,
461
+ 34,
462
+ 35,
463
+ 36,
464
+ 37,
465
+ 38,
466
+ 39
467
+ ],
468
+ "modality_keys": [
469
+ "left_wrist_eef",
470
+ "right_wrist_eef",
471
+ "left_hand_joints",
472
+ "right_hand_joints"
473
+ ],
474
+ "sin_cos_embedding_keys": null,
475
+ "mean_std_embedding_keys": null,
476
+ "action_configs": [
477
+ {
478
+ "rep": "RELATIVE",
479
+ "type": "EEF",
480
+ "format": "XYZ_ROT6D",
481
+ "state_key": "left_wrist_eef"
482
+ },
483
+ {
484
+ "rep": "RELATIVE",
485
+ "type": "EEF",
486
+ "format": "XYZ_ROT6D",
487
+ "state_key": "right_wrist_eef"
488
+ },
489
+ {
490
+ "rep": "ABSOLUTE",
491
+ "type": "NON_EEF",
492
+ "format": "DEFAULT",
493
+ "state_key": "left_hand_joints"
494
+ },
495
+ {
496
+ "rep": "ABSOLUTE",
497
+ "type": "NON_EEF",
498
+ "format": "DEFAULT",
499
+ "state_key": "right_hand_joints"
500
+ }
501
+ ]
502
+ },
503
+ "language": {
504
+ "delta_indices": [
505
+ 0
506
+ ],
507
+ "modality_keys": [
508
+ "annotation.human.coarse_action"
509
+ ],
510
+ "sin_cos_embedding_keys": null,
511
+ "mean_std_embedding_keys": null,
512
+ "action_configs": null
513
+ }
514
+ },
515
+ "xdof_relative_eef_relative_joint": {
516
+ "video": {
517
+ "delta_indices": [
518
+ -30,
519
+ 0
520
+ ],
521
+ "modality_keys": [
522
+ "top_camera-images-rgb_320_240",
523
+ "left_camera-images-rgb_320_240",
524
+ "right_camera-images-rgb_320_240"
525
+ ],
526
+ "sin_cos_embedding_keys": null,
527
+ "mean_std_embedding_keys": null,
528
+ "action_configs": null
529
+ },
530
+ "state": {
531
+ "delta_indices": [
532
+ 0
533
+ ],
534
+ "modality_keys": [
535
+ "left_wrist_eef",
536
+ "right_wrist_eef",
537
+ "left_gripper_pos",
538
+ "right_gripper_pos",
539
+ "left_joint_pos",
540
+ "right_joint_pos"
541
+ ],
542
+ "sin_cos_embedding_keys": null,
543
+ "mean_std_embedding_keys": null,
544
+ "action_configs": null
545
+ },
546
+ "action": {
547
+ "delta_indices": [
548
+ 0,
549
+ 1,
550
+ 2,
551
+ 3,
552
+ 4,
553
+ 5,
554
+ 6,
555
+ 7,
556
+ 8,
557
+ 9,
558
+ 10,
559
+ 11,
560
+ 12,
561
+ 13,
562
+ 14,
563
+ 15,
564
+ 16,
565
+ 17,
566
+ 18,
567
+ 19,
568
+ 20,
569
+ 21,
570
+ 22,
571
+ 23,
572
+ 24,
573
+ 25,
574
+ 26,
575
+ 27,
576
+ 28,
577
+ 29,
578
+ 30,
579
+ 31,
580
+ 32,
581
+ 33,
582
+ 34,
583
+ 35,
584
+ 36,
585
+ 37,
586
+ 38,
587
+ 39
588
+ ],
589
+ "modality_keys": [
590
+ "left_wrist_eef",
591
+ "right_wrist_eef",
592
+ "left_gripper_pos",
593
+ "right_gripper_pos",
594
+ "left_joint_pos",
595
+ "right_joint_pos"
596
+ ],
597
+ "sin_cos_embedding_keys": null,
598
+ "mean_std_embedding_keys": null,
599
+ "action_configs": [
600
+ {
601
+ "rep": "RELATIVE",
602
+ "type": "EEF",
603
+ "format": "XYZ_ROT6D",
604
+ "state_key": "left_wrist_eef"
605
+ },
606
+ {
607
+ "rep": "RELATIVE",
608
+ "type": "EEF",
609
+ "format": "XYZ_ROT6D",
610
+ "state_key": "right_wrist_eef"
611
+ },
612
+ {
613
+ "rep": "ABSOLUTE",
614
+ "type": "NON_EEF",
615
+ "format": "DEFAULT",
616
+ "state_key": "left_gripper_pos"
617
+ },
618
+ {
619
+ "rep": "ABSOLUTE",
620
+ "type": "NON_EEF",
621
+ "format": "DEFAULT",
622
+ "state_key": "right_gripper_pos"
623
+ },
624
+ {
625
+ "rep": "RELATIVE",
626
+ "type": "NON_EEF",
627
+ "format": "DEFAULT",
628
+ "state_key": "left_joint_pos"
629
+ },
630
+ {
631
+ "rep": "RELATIVE",
632
+ "type": "NON_EEF",
633
+ "format": "DEFAULT",
634
+ "state_key": "right_joint_pos"
635
+ }
636
+ ]
637
+ },
638
+ "language": {
639
+ "delta_indices": [
640
+ 0
641
+ ],
642
+ "modality_keys": [
643
+ "annotation.task"
644
+ ],
645
+ "sin_cos_embedding_keys": null,
646
+ "mean_std_embedding_keys": null,
647
+ "action_configs": null
648
+ }
649
+ },
650
+ "real_r1_pro_sharpa_relative_eef_maxinsights": {
651
+ "video": {
652
+ "delta_indices": [
653
+ -30,
654
+ 0
655
+ ],
656
+ "modality_keys": [
657
+ "ego_view_cropratio_res320x240_freq30"
658
+ ],
659
+ "sin_cos_embedding_keys": null,
660
+ "mean_std_embedding_keys": null,
661
+ "action_configs": null
662
+ },
663
+ "state": {
664
+ "delta_indices": [
665
+ 0
666
+ ],
667
+ "modality_keys": [
668
+ "left_wrist_eef",
669
+ "right_wrist_eef",
670
+ "left_hand_joints",
671
+ "right_hand_joints"
672
+ ],
673
+ "sin_cos_embedding_keys": null,
674
+ "mean_std_embedding_keys": null,
675
+ "action_configs": null
676
+ },
677
+ "action": {
678
+ "delta_indices": [
679
+ 0,
680
+ 1,
681
+ 2,
682
+ 3,
683
+ 4,
684
+ 5,
685
+ 6,
686
+ 7,
687
+ 8,
688
+ 9,
689
+ 10,
690
+ 11,
691
+ 12,
692
+ 13,
693
+ 14,
694
+ 15,
695
+ 16,
696
+ 17,
697
+ 18,
698
+ 19,
699
+ 20,
700
+ 21,
701
+ 22,
702
+ 23,
703
+ 24,
704
+ 25,
705
+ 26,
706
+ 27,
707
+ 28,
708
+ 29,
709
+ 30,
710
+ 31,
711
+ 32,
712
+ 33,
713
+ 34,
714
+ 35,
715
+ 36,
716
+ 37,
717
+ 38,
718
+ 39
719
+ ],
720
+ "modality_keys": [
721
+ "left_wrist_eef",
722
+ "right_wrist_eef",
723
+ "left_hand_joints",
724
+ "right_hand_joints"
725
+ ],
726
+ "sin_cos_embedding_keys": null,
727
+ "mean_std_embedding_keys": null,
728
+ "action_configs": [
729
+ {
730
+ "rep": "RELATIVE",
731
+ "type": "EEF",
732
+ "format": "XYZ_ROT6D",
733
+ "state_key": "left_wrist_eef"
734
+ },
735
+ {
736
+ "rep": "RELATIVE",
737
+ "type": "EEF",
738
+ "format": "XYZ_ROT6D",
739
+ "state_key": "right_wrist_eef"
740
+ },
741
+ {
742
+ "rep": "ABSOLUTE",
743
+ "type": "NON_EEF",
744
+ "format": "DEFAULT",
745
+ "state_key": "left_hand_joints"
746
+ },
747
+ {
748
+ "rep": "ABSOLUTE",
749
+ "type": "NON_EEF",
750
+ "format": "DEFAULT",
751
+ "state_key": "right_hand_joints"
752
+ }
753
+ ]
754
+ },
755
+ "language": {
756
+ "delta_indices": [
757
+ 0
758
+ ],
759
+ "modality_keys": [
760
+ "annotation.human.coarse_action"
761
+ ],
762
+ "sin_cos_embedding_keys": null,
763
+ "mean_std_embedding_keys": null,
764
+ "action_configs": null
765
+ }
766
+ },
767
+ "xdof_relative_eef_relative_joint_subtask": {
768
+ "video": {
769
+ "delta_indices": [
770
+ -30,
771
+ 0
772
+ ],
773
+ "modality_keys": [
774
+ "top_camera-images-rgb_320_240",
775
+ "left_camera-images-rgb_320_240",
776
+ "right_camera-images-rgb_320_240"
777
+ ],
778
+ "sin_cos_embedding_keys": null,
779
+ "mean_std_embedding_keys": null,
780
+ "action_configs": null
781
+ },
782
+ "state": {
783
+ "delta_indices": [
784
+ 0
785
+ ],
786
+ "modality_keys": [
787
+ "left_wrist_eef",
788
+ "right_wrist_eef",
789
+ "left_gripper_pos",
790
+ "right_gripper_pos",
791
+ "left_joint_pos",
792
+ "right_joint_pos"
793
+ ],
794
+ "sin_cos_embedding_keys": null,
795
+ "mean_std_embedding_keys": null,
796
+ "action_configs": null
797
+ },
798
+ "action": {
799
+ "delta_indices": [
800
+ 0,
801
+ 1,
802
+ 2,
803
+ 3,
804
+ 4,
805
+ 5,
806
+ 6,
807
+ 7,
808
+ 8,
809
+ 9,
810
+ 10,
811
+ 11,
812
+ 12,
813
+ 13,
814
+ 14,
815
+ 15,
816
+ 16,
817
+ 17,
818
+ 18,
819
+ 19,
820
+ 20,
821
+ 21,
822
+ 22,
823
+ 23,
824
+ 24,
825
+ 25,
826
+ 26,
827
+ 27,
828
+ 28,
829
+ 29,
830
+ 30,
831
+ 31,
832
+ 32,
833
+ 33,
834
+ 34,
835
+ 35,
836
+ 36,
837
+ 37,
838
+ 38,
839
+ 39
840
+ ],
841
+ "modality_keys": [
842
+ "left_wrist_eef",
843
+ "right_wrist_eef",
844
+ "left_gripper_pos",
845
+ "right_gripper_pos",
846
+ "left_joint_pos",
847
+ "right_joint_pos"
848
+ ],
849
+ "sin_cos_embedding_keys": null,
850
+ "mean_std_embedding_keys": null,
851
+ "action_configs": [
852
+ {
853
+ "rep": "RELATIVE",
854
+ "type": "EEF",
855
+ "format": "XYZ_ROT6D",
856
+ "state_key": "left_wrist_eef"
857
+ },
858
+ {
859
+ "rep": "RELATIVE",
860
+ "type": "EEF",
861
+ "format": "XYZ_ROT6D",
862
+ "state_key": "right_wrist_eef"
863
+ },
864
+ {
865
+ "rep": "ABSOLUTE",
866
+ "type": "NON_EEF",
867
+ "format": "DEFAULT",
868
+ "state_key": "left_gripper_pos"
869
+ },
870
+ {
871
+ "rep": "ABSOLUTE",
872
+ "type": "NON_EEF",
873
+ "format": "DEFAULT",
874
+ "state_key": "right_gripper_pos"
875
+ },
876
+ {
877
+ "rep": "RELATIVE",
878
+ "type": "NON_EEF",
879
+ "format": "DEFAULT",
880
+ "state_key": "left_joint_pos"
881
+ },
882
+ {
883
+ "rep": "RELATIVE",
884
+ "type": "NON_EEF",
885
+ "format": "DEFAULT",
886
+ "state_key": "right_joint_pos"
887
+ }
888
+ ]
889
+ },
890
+ "language": {
891
+ "delta_indices": [
892
+ 0
893
+ ],
894
+ "modality_keys": [
895
+ "annotation.sub_task"
896
+ ],
897
+ "sin_cos_embedding_keys": null,
898
+ "mean_std_embedding_keys": null,
899
+ "action_configs": null
900
+ }
901
+ },
902
+ "oxe_droid_relative_eef_relative_joint": {
903
+ "video": {
904
+ "delta_indices": [
905
+ -15,
906
+ 0
907
+ ],
908
+ "modality_keys": [
909
+ "exterior_image_1_left",
910
+ "wrist_image_left"
911
+ ],
912
+ "sin_cos_embedding_keys": null,
913
+ "mean_std_embedding_keys": null,
914
+ "action_configs": null
915
+ },
916
+ "state": {
917
+ "delta_indices": [
918
+ 0
919
+ ],
920
+ "modality_keys": [
921
+ "eef_9d",
922
+ "gripper_position",
923
+ "joint_position"
924
+ ],
925
+ "sin_cos_embedding_keys": null,
926
+ "mean_std_embedding_keys": null,
927
+ "action_configs": null
928
+ },
929
+ "action": {
930
+ "delta_indices": [
931
+ 0,
932
+ 1,
933
+ 2,
934
+ 3,
935
+ 4,
936
+ 5,
937
+ 6,
938
+ 7,
939
+ 8,
940
+ 9,
941
+ 10,
942
+ 11,
943
+ 12,
944
+ 13,
945
+ 14,
946
+ 15,
947
+ 16,
948
+ 17,
949
+ 18,
950
+ 19,
951
+ 20,
952
+ 21,
953
+ 22,
954
+ 23,
955
+ 24,
956
+ 25,
957
+ 26,
958
+ 27,
959
+ 28,
960
+ 29,
961
+ 30,
962
+ 31,
963
+ 32,
964
+ 33,
965
+ 34,
966
+ 35,
967
+ 36,
968
+ 37,
969
+ 38,
970
+ 39
971
+ ],
972
+ "modality_keys": [
973
+ "eef_9d",
974
+ "gripper_position",
975
+ "joint_position"
976
+ ],
977
+ "sin_cos_embedding_keys": null,
978
+ "mean_std_embedding_keys": null,
979
+ "action_configs": [
980
+ {
981
+ "rep": "RELATIVE",
982
+ "type": "EEF",
983
+ "format": "XYZ_ROT6D",
984
+ "state_key": "eef_9d"
985
+ },
986
+ {
987
+ "rep": "ABSOLUTE",
988
+ "type": "NON_EEF",
989
+ "format": "DEFAULT",
990
+ "state_key": "gripper_position"
991
+ },
992
+ {
993
+ "rep": "RELATIVE",
994
+ "type": "NON_EEF",
995
+ "format": "DEFAULT",
996
+ "state_key": "joint_position"
997
+ }
998
+ ]
999
+ },
1000
+ "language": {
1001
+ "delta_indices": [
1002
+ 0
1003
+ ],
1004
+ "modality_keys": [
1005
+ "annotation.language.language_instruction"
1006
+ ],
1007
+ "sin_cos_embedding_keys": null,
1008
+ "mean_std_embedding_keys": null,
1009
+ "action_configs": null
1010
+ }
1011
+ },
1012
+ "new_embodiment": {
1013
+ "video": {
1014
+ "delta_indices": [
1015
+ 0
1016
+ ],
1017
+ "modality_keys": [
1018
+ "cam_left_head",
1019
+ "cam_left_wrist",
1020
+ "cam_right_wrist"
1021
+ ],
1022
+ "sin_cos_embedding_keys": null,
1023
+ "mean_std_embedding_keys": null,
1024
+ "action_configs": null
1025
+ },
1026
+ "state": {
1027
+ "delta_indices": [
1028
+ 0
1029
+ ],
1030
+ "modality_keys": [
1031
+ "arm_left",
1032
+ "arm_right",
1033
+ "head",
1034
+ "lift",
1035
+ "odometry"
1036
+ ],
1037
+ "sin_cos_embedding_keys": null,
1038
+ "mean_std_embedding_keys": null,
1039
+ "action_configs": null
1040
+ },
1041
+ "action": {
1042
+ "delta_indices": [
1043
+ 0,
1044
+ 1,
1045
+ 2,
1046
+ 3,
1047
+ 4,
1048
+ 5,
1049
+ 6,
1050
+ 7,
1051
+ 8,
1052
+ 9,
1053
+ 10,
1054
+ 11,
1055
+ 12,
1056
+ 13,
1057
+ 14,
1058
+ 15
1059
+ ],
1060
+ "modality_keys": [
1061
+ "arm_left",
1062
+ "arm_right",
1063
+ "head",
1064
+ "lift",
1065
+ "odometry"
1066
+ ],
1067
+ "sin_cos_embedding_keys": null,
1068
+ "mean_std_embedding_keys": null,
1069
+ "action_configs": [
1070
+ {
1071
+ "rep": "ABSOLUTE",
1072
+ "type": "NON_EEF",
1073
+ "format": "DEFAULT",
1074
+ "state_key": null
1075
+ },
1076
+ {
1077
+ "rep": "ABSOLUTE",
1078
+ "type": "NON_EEF",
1079
+ "format": "DEFAULT",
1080
+ "state_key": null
1081
+ },
1082
+ {
1083
+ "rep": "ABSOLUTE",
1084
+ "type": "NON_EEF",
1085
+ "format": "DEFAULT",
1086
+ "state_key": null
1087
+ },
1088
+ {
1089
+ "rep": "ABSOLUTE",
1090
+ "type": "NON_EEF",
1091
+ "format": "DEFAULT",
1092
+ "state_key": null
1093
+ },
1094
+ {
1095
+ "rep": "ABSOLUTE",
1096
+ "type": "NON_EEF",
1097
+ "format": "DEFAULT",
1098
+ "state_key": null
1099
+ }
1100
+ ]
1101
+ },
1102
+ "language": {
1103
+ "delta_indices": [
1104
+ 0
1105
+ ],
1106
+ "modality_keys": [
1107
+ "annotation.human.primitive_instruction"
1108
+ ],
1109
+ "sin_cos_embedding_keys": null,
1110
+ "mean_std_embedding_keys": null,
1111
+ "action_configs": null
1112
+ }
1113
+ }
1114
+ },
1115
+ "image_crop_size": [
1116
+ 230,
1117
+ 230
1118
+ ],
1119
+ "image_target_size": [
1120
+ 256,
1121
+ 256
1122
+ ],
1123
+ "use_albumentations": true,
1124
+ "random_rotation_angle": 0,
1125
+ "color_jitter_params": {
1126
+ "brightness": 0.3,
1127
+ "contrast": 0.4,
1128
+ "saturation": 0.5,
1129
+ "hue": 0.08
1130
+ },
1131
+ "shortest_image_edge": 256,
1132
+ "crop_fraction": 0.95,
1133
+ "letter_box_transform": false,
1134
+ "model_name": "nvidia/Cosmos-Reason2-2B",
1135
+ "model_type": "qwen",
1136
+ "formalize_language": true,
1137
+ "max_state_dim": 132,
1138
+ "max_action_dim": 132,
1139
+ "max_action_horizon": 40,
1140
+ "use_percentiles": true,
1141
+ "use_mean_std": false,
1142
+ "clip_outliers": true,
1143
+ "apply_sincos_state_encoding": false,
1144
+ "use_relative_action": true,
1145
+ "exclude_state": false,
1146
+ "state_dropout_prob": 0.2
1147
+ }
1148
+ }
statistics.json ADDED
The diff for this file is too large to render. See raw diff
 
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0cb4269fe3884109f55023fd3e6c0bcdeedeba7a95dc47aa77e1b05e68732a4
3
+ size 6033
wandb_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"project": "finetune-gr00t-n1d7", "run_id": "groot_stage2_persubtask_percam_1k_0515_0303_0515_0358"}