Dongkkka committed on
Commit
bfb72e5
·
verified ·
1 Parent(s): 9f5d8f6

Upload folder using huggingface_hub

Browse files
*.pth/config.json ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_horizon": 40,
3
+ "add_pos_embed": true,
4
+ "apply_sincos_state_encoding": false,
5
+ "architectures": [
6
+ "Gr00tN1d7"
7
+ ],
8
+ "attn_dropout": 0.2,
9
+ "attn_implementation": null,
10
+ "backbone_embedding_dim": 2048,
11
+ "backbone_trainable_params_fp32": true,
12
+ "clip_outliers": true,
13
+ "color_jitter_params": {
14
+ "brightness": 0.3,
15
+ "contrast": 0.4,
16
+ "hue": 0.08,
17
+ "saturation": 0.5
18
+ },
19
+ "crop_fraction": 0.95,
20
+ "diffusion_model_cfg": {
21
+ "attention_head_dim": 48,
22
+ "dropout": 0.2,
23
+ "final_dropout": true,
24
+ "interleave_self_attention": true,
25
+ "norm_type": "ada_norm",
26
+ "num_attention_heads": 32,
27
+ "num_layers": 32,
28
+ "output_dim": 1024,
29
+ "positional_embeddings": null
30
+ },
31
+ "dtype": "float32",
32
+ "exclude_state": false,
33
+ "formalize_language": true,
34
+ "hidden_size": 1024,
35
+ "image_crop_size": [
36
+ 230,
37
+ 230
38
+ ],
39
+ "image_target_size": [
40
+ 256,
41
+ 256
42
+ ],
43
+ "letter_box_transform": false,
44
+ "load_bf16": false,
45
+ "max_action_dim": 132,
46
+ "max_num_embodiments": 32,
47
+ "max_seq_len": 1024,
48
+ "max_state_dim": 132,
49
+ "model_dtype": "bfloat16",
50
+ "model_name": "nvidia/Cosmos-Reason2-2B",
51
+ "model_type": "Gr00tN1d7",
52
+ "noise_beta_alpha": 1.5,
53
+ "noise_beta_beta": 1.0,
54
+ "noise_s": 0.999,
55
+ "num_inference_timesteps": 4,
56
+ "num_timestep_buckets": 1000,
57
+ "random_history_crop": true,
58
+ "random_rotation_angle": 0,
59
+ "reproject_vision": false,
60
+ "rtc_ramp_rate": 6.0,
61
+ "select_layer": 16,
62
+ "shortest_image_edge": 256,
63
+ "state_dropout_prob": 0.2,
64
+ "state_gaussian_noise_std": 0.0,
65
+ "transformers_version": "4.57.3",
66
+ "tune_diffusion_model": true,
67
+ "tune_linear": true,
68
+ "tune_llm": false,
69
+ "tune_projector": true,
70
+ "tune_top_llm_layers": 0,
71
+ "tune_visual": false,
72
+ "tune_vlln": true,
73
+ "use_albumentations": true,
74
+ "use_alternate_vl_dit": true,
75
+ "use_flash_attention": true,
76
+ "use_future_tokens": false,
77
+ "use_mean_std": false,
78
+ "use_percentiles": false,
79
+ "use_relative_action": true,
80
+ "use_vl_self_attention": true,
81
+ "use_vlln": true,
82
+ "vl_self_attention_cfg": {
83
+ "attention_head_dim": 64,
84
+ "dropout": 0.2,
85
+ "final_dropout": true,
86
+ "num_attention_heads": 32,
87
+ "num_layers": 4,
88
+ "positional_embeddings": null
89
+ }
90
+ }
*.pth/embodiment_id.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "robocasa_panda_omron": 13,
3
+ "oxe_droid": 17,
4
+ "oxe_fractal": 18,
5
+ "oxe_language_table": 19,
6
+ "oxe_bridge": 20,
7
+ "unknown": 22,
8
+ "gr1_unified": 20,
9
+ "agibot": 26,
10
+ "sim_behavior_r1_pro": 23,
11
+ "xdof": 24,
12
+ "xdof_oss_data": 25,
13
+ "unitree_g1_full_body_with_waist_height_nav_cmd": 25,
14
+ "real_r1_pro_sharpa": 27,
15
+ "real_r1_pro_sharpa_add_view": 27,
16
+ "real_r1_pro_sharpa_relative_arm_joint": 26,
17
+ "real_r1_pro_sharpa_delta_eef": 26,
18
+ "real_r1_pro_sharpa_absolute_eef": 26,
19
+ "real_r1_pro_sharpa_meanstd": 26,
20
+ "real_r1_pro_sharpa_relative_eef": 26,
21
+ "real_r1_pro_sharpa_relative_eef_add_view": 26,
22
+ "real_r1_pro_sharpa_relative_eef_relative_hand": 26,
23
+ "real_r1_pro_sharpa_relative_eef_human": 26,
24
+ "real_r1_pro_sharpa_relative_eef_human_add_view": 26,
25
+ "real_r1_pro_sharpa_relative_eef_human_relative_hand": 26,
26
+ "real_r1_pro_sharpa_relative_eef_egodex": 26,
27
+ "real_r1_pro_sharpa_relative_eef_egodex_relative_hand": 26,
28
+ "real_r1_pro_sharpa_relative_eef_egodex_wrist_only": 26,
29
+ "real_r1_pro_sharpa_relative_eef_maxinsights": 26,
30
+ "real_r1_pro_sharpa_relative_eef_maxinsights_relative_hand": 26,
31
+ "real_r1_pro_sharpa_relative_eef_mecka": 26,
32
+ "real_r1_pro_sharpa_relative_eef_mecka_relative_hand": 26,
33
+ "real_g1_relative_eef_absolute_joints": 25,
34
+ "real_g1_relative_eef_absolute_joints_wrist_cam": 25,
35
+ "real_g1_relative_eef_relative_joints": 25,
36
+ "real_r1_pro_sharpa_relative_eef_relative_hand_relative_joint": 26,
37
+ "real_r1_pro_sharpa_relative_joint": 29,
38
+ "oxe_droid_relative_eef_relative_joint": 24,
39
+ "oxe_droid_relative_eef_relative_joint_swapped": 24,
40
+ "oxe_droid_relative_eef_relative_joint_upweight_z": 24,
41
+ "oxe_droid_relative_eef_relative_joint_upweight_z_swapped": 24,
42
+ "oxe_droid_relative_eef_relative_joint_3view": 24,
43
+ "oxe_droid_relative_eef_relative_joint_3view_swapped": 24,
44
+ "oxe_droid_relative_eef": 24,
45
+ "oxe_droid_joint_position_relative": 24,
46
+ "xdof_relative_eef_relative_joint": 27,
47
+ "xdof_relative_eef_relative_joint_subtask": 27,
48
+ "xdof_relative_eef": 27,
49
+ "xdof_relative_joint": 28,
50
+ "simpler_env_google": 0,
51
+ "simpler_env_widowx": 1,
52
+ "libero_sim": 2,
53
+ "droid_sim": 3,
54
+ "new_embodiment": 10
55
+ }
*.pth/experiment_cfg/conf.yaml ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ load_config_path: null
2
+ model:
3
+ model_type: Gr00tN1d7
4
+ model_dtype: bfloat16
5
+ model_name: nvidia/Cosmos-Reason2-2B
6
+ backbone_model_type: qwen
7
+ model_revision: null
8
+ tune_top_llm_layers: 0
9
+ backbone_embedding_dim: 2048
10
+ tune_llm: false
11
+ tune_visual: false
12
+ select_layer: 12
13
+ reproject_vision: false
14
+ use_flash_attention: true
15
+ load_bf16: false
16
+ backbone_trainable_params_fp32: true
17
+ image_crop_size:
18
+ - 230
19
+ - 230
20
+ image_target_size:
21
+ - 256
22
+ - 256
23
+ shortest_image_edge: null
24
+ crop_fraction: null
25
+ random_rotation_angle: null
26
+ color_jitter_params: null
27
+ use_albumentations_transforms: true
28
+ extra_augmentation_config: null
29
+ formalize_language: true
30
+ apply_sincos_state_encoding: false
31
+ use_percentiles: false
32
+ clip_outliers: true
33
+ use_relative_action: true
34
+ max_state_dim: 132
35
+ max_action_dim: 132
36
+ action_horizon: 40
37
+ hidden_size: 1024
38
+ input_embedding_dim: 1536
39
+ state_history_length: 1
40
+ add_pos_embed: true
41
+ attn_dropout: 0.2
42
+ use_vlln: true
43
+ max_seq_len: 1024
44
+ use_alternate_vl_dit: true
45
+ attend_text_every_n_blocks: 2
46
+ diffusion_model_cfg:
47
+ positional_embeddings: null
48
+ num_layers: 16
49
+ num_attention_heads: 32
50
+ attention_head_dim: 48
51
+ norm_type: ada_norm
52
+ dropout: 0.2
53
+ final_dropout: true
54
+ output_dim: 1024
55
+ interleave_self_attention: true
56
+ num_inference_timesteps: 4
57
+ noise_beta_alpha: 1.5
58
+ noise_beta_beta: 1.0
59
+ noise_s: 0.999
60
+ num_timestep_buckets: 1000
61
+ tune_projector: true
62
+ tune_diffusion_model: true
63
+ tune_vlln: true
64
+ state_dropout_prob: 0.2
65
+ exclude_state: false
66
+ use_mean_std: false
67
+ max_num_embodiments: 32
68
+ data:
69
+ datasets:
70
+ - dataset_paths:
71
+ - /data/dataset
72
+ embodiment_tag: new_embodiment
73
+ mix_ratio: 1.0
74
+ dataset_type: physical_embodiment
75
+ val_dataset_path: null
76
+ modality_configs:
77
+ new_embodiment:
78
+ video:
79
+ delta_indices:
80
+ - 0
81
+ modality_keys:
82
+ - cam_left_head
83
+ - cam_left_wrist
84
+ - cam_right_wrist
85
+ sin_cos_embedding_keys: null
86
+ mean_std_embedding_keys: null
87
+ action_configs: null
88
+ state:
89
+ delta_indices:
90
+ - 0
91
+ modality_keys:
92
+ - arm_left
93
+ - arm_right
94
+ - odometry
95
+ sin_cos_embedding_keys: null
96
+ mean_std_embedding_keys: null
97
+ action_configs: null
98
+ action:
99
+ delta_indices:
100
+ - 0
101
+ - 1
102
+ - 2
103
+ - 3
104
+ - 4
105
+ - 5
106
+ - 6
107
+ - 7
108
+ - 8
109
+ - 9
110
+ - 10
111
+ - 11
112
+ - 12
113
+ - 13
114
+ - 14
115
+ - 15
116
+ modality_keys:
117
+ - arm_left
118
+ - arm_right
119
+ - odometry
120
+ sin_cos_embedding_keys: null
121
+ mean_std_embedding_keys: null
122
+ action_configs:
123
+ - rep: ABSOLUTE
124
+ type: NON_EEF
125
+ format: DEFAULT
126
+ state_key: null
127
+ - rep: ABSOLUTE
128
+ type: NON_EEF
129
+ format: DEFAULT
130
+ state_key: null
131
+ - rep: ABSOLUTE
132
+ type: NON_EEF
133
+ format: DEFAULT
134
+ state_key: null
135
+ language:
136
+ delta_indices:
137
+ - 0
138
+ modality_keys:
139
+ - annotation.human.task_description
140
+ sin_cos_embedding_keys: null
141
+ mean_std_embedding_keys: null
142
+ action_configs: null
143
+ download_cache: false
144
+ shard_size: 1024
145
+ episode_sampling_rate: 0.1
146
+ num_shards_per_epoch: 100000
147
+ override_pretraining_statistics: true
148
+ mode: single_turn
149
+ random_chop: 0.0
150
+ mock_dataset_mode: false
151
+ shuffle: true
152
+ seed: 42
153
+ multiprocessing_context: fork
154
+ allow_padding: false
155
+ subsample_ratio: 1.0
156
+ image_crop_size:
157
+ - 244
158
+ - 244
159
+ image_target_size:
160
+ - 224
161
+ - 224
162
+ video_backend: torchcodec
163
+ training:
164
+ output_dir: /data/checkpoints/task0025_subtask_lang_final_layer_vlm_threecam_minmax_80k_gpu1_20260618_045240
165
+ experiment_name: task0025_subtask_lang_final_layer_vlm_threecam_minmax_80k_gpu1_20260618_045240
166
+ max_steps: 80000
167
+ global_batch_size: 8
168
+ batch_size: null
169
+ gradient_accumulation_steps: 1
170
+ learning_rate: 0.0001
171
+ lr_scheduler_type: cosine
172
+ weight_decay: 1.0e-05
173
+ warmup_ratio: 0.05
174
+ warmup_steps: 0
175
+ max_grad_norm: 1.0
176
+ optim: adamw_torch
177
+ start_from_checkpoint: /data/base_model
178
+ skip_weight_loading: false
179
+ tf32: true
180
+ fp16: false
181
+ bf16: true
182
+ eval_bf16: true
183
+ logging_steps: 10
184
+ save_steps: 20000
185
+ save_total_limit: 2
186
+ save_vl_model: false
187
+ save_only_model: true
188
+ upload_checkpoints: false
189
+ upload_every: 1000
190
+ upload_last_n_checkpoints: 5
191
+ max_concurrent_uploads: 2
192
+ eval_strategy: 'no'
193
+ eval_steps: 500
194
+ eval_set_split_ratio: 0.1
195
+ eval_batch_size: 2
196
+ save_best_eval_metric_name: ''
197
+ save_best_eval_metric_greater_is_better: true
198
+ deepspeed_stage: 2
199
+ gradient_checkpointing: false
200
+ transformers_trust_remote_code: true
201
+ transformers_local_files_only: false
202
+ transformers_cache_dir: null
203
+ transformers_access_token: null
204
+ use_ddp: false
205
+ ddp_bucket_cap_mb: 100
206
+ num_gpus: 1
207
+ dataloader_num_workers: 8
208
+ remove_unused_columns: false
209
+ use_wandb: false
210
+ wandb_project: finetune-gr00t-n1d7
211
+ enable_profiling: false
212
+ max_retries: 3
213
+ assert_loss_less_than: null
214
+ add_rl_callback: false
215
+ enable_open_loop_eval: false
216
+ open_loop_eval_traj_ids:
217
+ - 0
218
+ open_loop_eval_steps_per_traj: 100
219
+ open_loop_eval_plot_indices: null
220
+ max_steps: 80000
221
+ save_steps: 20000
*.pth/experiment_cfg/config.yaml ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !!python/object:gr00t.configs.base_config.Config
2
+ data: !!python/object:gr00t.configs.data.data_config.DataConfig
3
+ allow_padding: false
4
+ datasets:
5
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
6
+ dataset_paths:
7
+ - /data/dataset
8
+ dataset_type: physical_embodiment
9
+ embodiment_tag: new_embodiment
10
+ mix_ratio: 1.0
11
+ val_dataset_path: null
12
+ download_cache: false
13
+ episode_sampling_rate: 0.1
14
+ image_crop_size:
15
+ - 244
16
+ - 244
17
+ image_target_size:
18
+ - 224
19
+ - 224
20
+ mock_dataset_mode: false
21
+ modality_configs:
22
+ new_embodiment:
23
+ action: !!python/object:gr00t.data.types.ModalityConfig
24
+ action_configs:
25
+ - !!python/object:gr00t.data.types.ActionConfig
26
+ format: &id001 !!python/object/apply:gr00t.data.types.ActionFormat
27
+ - default
28
+ rep: &id002 !!python/object/apply:gr00t.data.types.ActionRepresentation
29
+ - absolute
30
+ state_key: null
31
+ type: &id003 !!python/object/apply:gr00t.data.types.ActionType
32
+ - non_eef
33
+ - !!python/object:gr00t.data.types.ActionConfig
34
+ format: *id001
35
+ rep: *id002
36
+ state_key: null
37
+ type: *id003
38
+ - !!python/object:gr00t.data.types.ActionConfig
39
+ format: *id001
40
+ rep: *id002
41
+ state_key: null
42
+ type: *id003
43
+ delta_indices:
44
+ - 0
45
+ - 1
46
+ - 2
47
+ - 3
48
+ - 4
49
+ - 5
50
+ - 6
51
+ - 7
52
+ - 8
53
+ - 9
54
+ - 10
55
+ - 11
56
+ - 12
57
+ - 13
58
+ - 14
59
+ - 15
60
+ mean_std_embedding_keys: null
61
+ modality_keys:
62
+ - arm_left
63
+ - arm_right
64
+ - odometry
65
+ sin_cos_embedding_keys: null
66
+ language: !!python/object:gr00t.data.types.ModalityConfig
67
+ action_configs: null
68
+ delta_indices:
69
+ - 0
70
+ mean_std_embedding_keys: null
71
+ modality_keys:
72
+ - annotation.human.task_description
73
+ sin_cos_embedding_keys: null
74
+ state: !!python/object:gr00t.data.types.ModalityConfig
75
+ action_configs: null
76
+ delta_indices:
77
+ - 0
78
+ mean_std_embedding_keys: null
79
+ modality_keys:
80
+ - arm_left
81
+ - arm_right
82
+ - odometry
83
+ sin_cos_embedding_keys: null
84
+ video: !!python/object:gr00t.data.types.ModalityConfig
85
+ action_configs: null
86
+ delta_indices:
87
+ - 0
88
+ mean_std_embedding_keys: null
89
+ modality_keys:
90
+ - cam_left_head
91
+ - cam_left_wrist
92
+ - cam_right_wrist
93
+ sin_cos_embedding_keys: null
94
+ mode: single_turn
95
+ multiprocessing_context: fork
96
+ num_shards_per_epoch: 100000
97
+ override_pretraining_statistics: true
98
+ random_chop: 0.0
99
+ seed: 42
100
+ shard_size: 1024
101
+ shuffle: true
102
+ subsample_ratio: 1.0
103
+ video_backend: torchcodec
104
+ load_config_path: null
105
+ model: !!python/object:gr00t.configs.model.gr00t_n1d7.Gr00tN1d7Config
106
+ _attn_implementation_internal: null
107
+ _commit_hash: null
108
+ _name_or_path: ''
109
+ _output_attentions: false
110
+ add_cross_attention: false
111
+ architectures: null
112
+ backbone_trainable_params_fp32: true
113
+ bad_words_ids: null
114
+ begin_suppress_tokens: null
115
+ bos_token_id: null
116
+ chunk_size_feed_forward: 0
117
+ clip_outliers: true
118
+ color_jitter_params: null
119
+ cross_attention_hidden_size: null
120
+ decoder_start_token_id: null
121
+ diffusion_model_cfg:
122
+ attention_head_dim: 48
123
+ dropout: 0.2
124
+ final_dropout: true
125
+ interleave_self_attention: true
126
+ norm_type: ada_norm
127
+ num_attention_heads: 32
128
+ num_layers: 16
129
+ output_dim: 1024
130
+ positional_embeddings: null
131
+ diversity_penalty: 0.0
132
+ do_sample: false
133
+ dtype: null
134
+ early_stopping: false
135
+ encoder_no_repeat_ngram_size: 0
136
+ eos_token_id: null
137
+ exponential_decay_length_penalty: null
138
+ extra_augmentation_config: null
139
+ finetuning_task: null
140
+ forced_bos_token_id: null
141
+ forced_eos_token_id: null
142
+ id2label:
143
+ 0: LABEL_0
144
+ 1: LABEL_1
145
+ is_decoder: false
146
+ is_encoder_decoder: false
147
+ label2id:
148
+ LABEL_0: 0
149
+ LABEL_1: 1
150
+ length_penalty: 1.0
151
+ load_bf16: false
152
+ max_length: 20
153
+ min_length: 0
154
+ model_name: nvidia/Cosmos-Reason2-2B
155
+ no_repeat_ngram_size: 0
156
+ num_beam_groups: 1
157
+ num_beams: 1
158
+ num_return_sequences: 1
159
+ output_hidden_states: false
160
+ output_scores: false
161
+ pad_token_id: null
162
+ prefix: null
163
+ problem_type: null
164
+ pruned_heads: {}
165
+ random_rotation_angle: null
166
+ remove_invalid_values: false
167
+ repetition_penalty: 1.0
168
+ reproject_vision: false
169
+ return_dict: true
170
+ return_dict_in_generate: false
171
+ sep_token_id: null
172
+ state_dropout_prob: 0.2
173
+ suppress_tokens: null
174
+ task_specific_params: null
175
+ temperature: 1.0
176
+ tf_legacy_loss: false
177
+ tie_encoder_decoder: false
178
+ tie_word_embeddings: true
179
+ tokenizer_class: null
180
+ top_k: 50
181
+ top_p: 1.0
182
+ torchscript: false
183
+ transformers_version: null
184
+ tune_diffusion_model: true
185
+ tune_llm: false
186
+ tune_projector: true
187
+ tune_visual: false
188
+ typical_p: 1.0
189
+ use_bfloat16: false
190
+ use_percentiles: false
191
+ use_relative_action: true
192
+ training: !!python/object:gr00t.configs.training.training_config.TrainingConfig
193
+ add_rl_callback: false
194
+ assert_loss_less_than: null
195
+ batch_size: null
196
+ bf16: true
197
+ dataloader_num_workers: 8
198
+ ddp_bucket_cap_mb: 100
199
+ deepspeed_stage: 2
200
+ enable_open_loop_eval: false
201
+ enable_profiling: false
202
+ eval_batch_size: 2
203
+ eval_bf16: true
204
+ eval_set_split_ratio: 0.1
205
+ eval_steps: 500
206
+ eval_strategy: 'no'
207
+ experiment_name: task0025_subtask_lang_final_layer_vlm_threecam_minmax_80k_gpu1_20260618_045240
208
+ fp16: false
209
+ global_batch_size: 8
210
+ gradient_accumulation_steps: 1
211
+ gradient_checkpointing: false
212
+ learning_rate: 0.0001
213
+ logging_steps: 10
214
+ lr_scheduler_type: cosine
215
+ max_concurrent_uploads: 2
216
+ max_grad_norm: 1.0
217
+ max_retries: 3
218
+ max_steps: 80000
219
+ num_gpus: 1
220
+ open_loop_eval_plot_indices: null
221
+ open_loop_eval_steps_per_traj: 100
222
+ open_loop_eval_traj_ids:
223
+ - 0
224
+ optim: adamw_torch
225
+ output_dir: /data/checkpoints/task0025_subtask_lang_final_layer_vlm_threecam_minmax_80k_gpu1_20260618_045240
226
+ remove_unused_columns: false
227
+ save_best_eval_metric_greater_is_better: true
228
+ save_best_eval_metric_name: ''
229
+ save_only_model: true
230
+ save_steps: 20000
231
+ save_total_limit: 2
232
+ save_vl_model: false
233
+ skip_weight_loading: false
234
+ start_from_checkpoint: /data/base_model
235
+ tf32: true
236
+ transformers_access_token: null
237
+ transformers_cache_dir: null
238
+ transformers_local_files_only: false
239
+ transformers_trust_remote_code: true
240
+ upload_checkpoints: false
241
+ upload_every: 1000
242
+ upload_last_n_checkpoints: 5
243
+ use_ddp: false
244
+ use_wandb: false
245
+ wandb_project: finetune-gr00t-n1d7
246
+ warmup_ratio: 0.05
247
+ warmup_steps: 0
248
+ weight_decay: 1.0e-05
*.pth/experiment_cfg/dataset_statistics.json ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "new_embodiment": {
3
+ "state": {
4
+ "arm_left": {
5
+ "min": [
6
+ -0.5475472807884216,
7
+ -0.0003355582884978503,
8
+ -1.3053457736968994,
9
+ -2.6803677082061768,
10
+ -1.6582931280136108,
11
+ -0.6229879260063171,
12
+ -1.4020036458969116,
13
+ -0.0015339808305725455
14
+ ],
15
+ "max": [
16
+ 1.113825798034668,
17
+ 0.8176237344741821,
18
+ 0.23757527768611908,
19
+ -0.5585607290267944,
20
+ 1.0988095998764038,
21
+ 1.172776222229004,
22
+ 1.3638184070587158,
23
+ 1.0921943187713623
24
+ ],
25
+ "mean": [
26
+ 0.35580819845199585,
27
+ 0.5201765298843384,
28
+ -0.1417115330696106,
29
+ -1.9100525379180908,
30
+ 0.06096320226788521,
31
+ 0.3721380829811096,
32
+ -0.7635477185249329,
33
+ 0.2815868854522705
34
+ ],
35
+ "std": [
36
+ 0.4073781967163086,
37
+ 0.23927268385887146,
38
+ 0.2344869077205658,
39
+ 0.48143312335014343,
40
+ 0.5435206294059753,
41
+ 0.27136287093162537,
42
+ 0.7339828610420227,
43
+ 0.41869908571243286
44
+ ],
45
+ "q01": [
46
+ -0.3928197732567787,
47
+ 0.026710080318152904,
48
+ -0.9970582705736161,
49
+ -2.6524085998535156,
50
+ -1.257974841594696,
51
+ -0.20474184960126876,
52
+ -1.3634459972381592,
53
+ 0.0
54
+ ],
55
+ "q99": [
56
+ 0.9335210216045381,
57
+ 0.8175997734069824,
58
+ 0.09205574311316017,
59
+ -0.7039075952768324,
60
+ 0.8392134261131291,
61
+ 0.997027575969696,
62
+ 1.1889616501331333,
63
+ 1.0477088689804077
64
+ ]
65
+ },
66
+ "arm_right": {
67
+ "min": [
68
+ -0.09659285098314285,
69
+ -0.8176716566085815,
70
+ -0.3275049030780792,
71
+ -2.6530556678771973,
72
+ -0.4579651653766632,
73
+ -0.3955034017562866,
74
+ -0.1349928081035614,
75
+ 0.0
76
+ ],
77
+ "max": [
78
+ 0.8757112622261047,
79
+ -0.08124106377363205,
80
+ 1.1064674854278564,
81
+ -1.2425124645233154,
82
+ 1.333017349243164,
83
+ 0.8877794146537781,
84
+ 1.3636425733566284,
85
+ 0.8682331442832947
86
+ ],
87
+ "mean": [
88
+ 0.5309473276138306,
89
+ -0.6017705202102661,
90
+ 0.07096648961305618,
91
+ -2.2108590602874756,
92
+ 0.26930341124534607,
93
+ 0.3429528474807739,
94
+ 1.2567896842956543,
95
+ 0.04772842675447464
96
+ ],
97
+ "std": [
98
+ 0.3694664239883423,
99
+ 0.16356231272220612,
100
+ 0.16321000456809998,
101
+ 0.4757940173149109,
102
+ 0.4936450123786926,
103
+ 0.19132843613624573,
104
+ 0.2600846588611603,
105
+ 0.14340612292289734
106
+ ],
107
+ "q01": [
108
+ 0.030643662437796593,
109
+ -0.8176237344741821,
110
+ -0.08104931563138962,
111
+ -2.652324676513672,
112
+ -0.37889325618743896,
113
+ -0.11650907695293426,
114
+ 0.21011178195476532,
115
+ 0.0
116
+ ],
117
+ "q99": [
118
+ 0.859029233455658,
119
+ -0.16543731629848477,
120
+ 0.8615535879135133,
121
+ -1.5723973023891449,
122
+ 1.0613687527179718,
123
+ 0.6687612211704257,
124
+ 1.3634459972381592,
125
+ 0.6028544306755066
126
+ ]
127
+ },
128
+ "odometry": {
129
+ "min": [
130
+ -0.015251385979354382,
131
+ -0.3666536211967468,
132
+ -0.24855561554431915
133
+ ],
134
+ "max": [
135
+ 0.28565824031829834,
136
+ 0.019846245646476746,
137
+ 0.072826087474823
138
+ ],
139
+ "mean": [
140
+ 4.3003801692975685e-05,
141
+ -0.027965428307652473,
142
+ -7.66340053814929e-06
143
+ ],
144
+ "std": [
145
+ 0.002784445881843567,
146
+ 0.09051357954740524,
147
+ 0.002256158972159028
148
+ ],
149
+ "q01": [
150
+ -3.177549693646142e-05,
151
+ -0.3368525901436806,
152
+ -0.0052148745534941554
153
+ ],
154
+ "q99": [
155
+ 3.881613905832974e-05,
156
+ 0.0012015580607112498,
157
+ 0.005100751668214812
158
+ ]
159
+ }
160
+ },
161
+ "action": {
162
+ "arm_left": {
163
+ "min": [
164
+ -0.5476311445236206,
165
+ 0.0,
166
+ -1.3054176568984985,
167
+ -2.682774782180786,
168
+ -1.6628351211547852,
169
+ -0.6258641481399536,
170
+ -1.402058482170105,
171
+ 0.0
172
+ ],
173
+ "max": [
174
+ 1.1172550916671753,
175
+ 0.8175997734069824,
176
+ 0.2411240190267563,
177
+ -0.5568350553512573,
178
+ 1.0998642444610596,
179
+ 1.1734952926635742,
180
+ 1.4005244970321655,
181
+ 1.100000023841858
182
+ ],
183
+ "mean": [
184
+ 0.35810500383377075,
185
+ 0.519511342048645,
186
+ -0.14196135103702545,
187
+ -1.9126713275909424,
188
+ 0.058261577039957047,
189
+ 0.37303486466407776,
190
+ -0.763543426990509,
191
+ 0.29336875677108765
192
+ ],
193
+ "std": [
194
+ 0.40803930163383484,
195
+ 0.23888689279556274,
196
+ 0.23461289703845978,
197
+ 0.4830642342567444,
198
+ 0.545210599899292,
199
+ 0.2715252637863159,
200
+ 0.7340782880783081,
201
+ 0.44709646701812744
202
+ ],
203
+ "q01": [
204
+ -0.3942330479621887,
205
+ 0.026077672839164734,
206
+ -0.9986214637756348,
207
+ -2.652252674102783,
208
+ -1.2593982219696045,
209
+ -0.2070874124765396,
210
+ -1.3634459972381592,
211
+ 0.0
212
+ ],
213
+ "q99": [
214
+ 0.9341943264007568,
215
+ 0.8175997734069824,
216
+ 0.09348710186779549,
217
+ -0.7034682774543709,
218
+ 0.8390874862670898,
219
+ 0.9986214637756348,
220
+ 1.1903691291809082,
221
+ 1.100000023841858
222
+ ]
223
+ },
224
+ "arm_right": {
225
+ "min": [
226
+ -0.09980137646198273,
227
+ -0.8175997734069824,
228
+ -0.32994207739830017,
229
+ -2.652252674102783,
230
+ -0.46202123165130615,
231
+ -0.3957670331001282,
232
+ -0.13499030470848083,
233
+ 0.0
234
+ ],
235
+ "max": [
236
+ 0.8774369955062866,
237
+ -0.08130098134279251,
238
+ 1.1075341701507568,
239
+ -1.2394565343856812,
240
+ 1.3330292701721191,
241
+ 0.8881748914718628,
242
+ 1.3634459972381592,
243
+ 0.8795690536499023
244
+ ],
245
+ "mean": [
246
+ 0.5329897999763489,
247
+ -0.6011292338371277,
248
+ 0.07123955339193344,
249
+ -2.2133889198303223,
250
+ 0.27188241481781006,
251
+ 0.3438069224357605,
252
+ 1.2567932605743408,
253
+ 0.04718247056007385
254
+ ],
255
+ "std": [
256
+ 0.3692541718482971,
257
+ 0.1632792353630066,
258
+ 0.16333064436912537,
259
+ 0.4753437936306,
260
+ 0.4933122396469116,
261
+ 0.1913645714521408,
262
+ 0.2600988447666168,
263
+ 0.14372898638248444
264
+ ],
265
+ "q01": [
266
+ 0.030667630955576897,
267
+ -0.8175997734069824,
268
+ -0.08166213139891625,
269
+ -2.652252674102783,
270
+ -0.3788812756538391,
271
+ -0.11658254265785217,
272
+ 0.21015536785125732,
273
+ 0.0
274
+ ],
275
+ "q99": [
276
+ 0.8590052723884583,
277
+ -0.16566991806030273,
278
+ 0.8620972037315369,
279
+ -1.5707963705062866,
280
+ 1.0630487203598022,
281
+ 0.6688156127929688,
282
+ 1.3634459972381592,
283
+ 0.6062136888504028
284
+ ]
285
+ },
286
+ "odometry": {
287
+ "min": [
288
+ 0.0,
289
+ -0.330078125,
290
+ -0.4609375
291
+ ],
292
+ "max": [
293
+ 0.3072916567325592,
294
+ -0.0,
295
+ -0.0
296
+ ],
297
+ "mean": [
298
+ 6.987077358644456e-05,
299
+ -0.028174595907330513,
300
+ -9.38360062718857e-06
301
+ ],
302
+ "std": [
303
+ 0.004499922972172499,
304
+ 0.09222032129764557,
305
+ 0.002032066462561488
306
+ ],
307
+ "q01": [
308
+ 0.0,
309
+ -0.330078125,
310
+ -0.0
311
+ ],
312
+ "q99": [
313
+ 0.0,
314
+ 0.0,
315
+ 0.0
316
+ ]
317
+ }
318
+ },
319
+ "relative_action": {}
320
+ }
321
+ }
*.pth/experiment_cfg/final_model_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "Gr00tN1d7",
3
+ "model_dtype": "bfloat16",
4
+ "model_name": "nvidia/Cosmos-Reason2-2B",
5
+ "backbone_model_type": "qwen",
6
+ "model_revision": null,
7
+ "tune_top_llm_layers": 0,
8
+ "backbone_embedding_dim": 2048,
9
+ "tune_llm": false,
10
+ "tune_visual": false,
11
+ "select_layer": 16,
12
+ "reproject_vision": false,
13
+ "use_flash_attention": true,
14
+ "load_bf16": false,
15
+ "backbone_trainable_params_fp32": true,
16
+ "extra_augmentation_config": null,
17
+ "apply_sincos_state_encoding": false,
18
+ "use_percentiles": false,
19
+ "clip_outliers": true,
20
+ "use_relative_action": true,
21
+ "max_state_dim": 132,
22
+ "max_action_dim": 132,
23
+ "action_horizon": 40,
24
+ "hidden_size": 1024,
25
+ "input_embedding_dim": 1536,
26
+ "state_history_length": 1,
27
+ "add_pos_embed": true,
28
+ "attn_dropout": 0.2,
29
+ "use_vlln": true,
30
+ "max_seq_len": 1024,
31
+ "use_alternate_vl_dit": true,
32
+ "attend_text_every_n_blocks": 2,
33
+ "diffusion_model_cfg": {
34
+ "attention_head_dim": 48,
35
+ "dropout": 0.2,
36
+ "final_dropout": true,
37
+ "interleave_self_attention": true,
38
+ "norm_type": "ada_norm",
39
+ "num_attention_heads": 32,
40
+ "num_layers": 32,
41
+ "output_dim": 1024,
42
+ "positional_embeddings": null
43
+ },
44
+ "num_inference_timesteps": 4,
45
+ "noise_beta_alpha": 1.5,
46
+ "noise_beta_beta": 1.0,
47
+ "noise_s": 0.999,
48
+ "num_timestep_buckets": 1000,
49
+ "tune_projector": true,
50
+ "tune_diffusion_model": true,
51
+ "tune_vlln": true,
52
+ "state_dropout_prob": 0.2,
53
+ "exclude_state": false,
54
+ "use_mean_std": false,
55
+ "max_num_embodiments": 32
56
+ }
*.pth/experiment_cfg/final_processor_config.json ADDED
The diff for this file is too large to render. See raw diff
 
*.pth/model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f2fe11062392310da5ef217d8143d9f870f0547c0e0985b54a87c5af861d184
3
+ size 4986649584
*.pth/model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e509688d6023afda10e33c3f99919e24c04a2f53325425ff4d013734e84476c
3
+ size 4970792616
*.pth/model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:837e279fac4c925b3878b80bebfdc68337022cc487b8ac611cc10cf0a63fdc1c
3
+ size 2618758696
*.pth/model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
*.pth/processor_config.json ADDED
@@ -0,0 +1,1132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "Gr00tN1d7Processor",
3
+ "processor_kwargs": {
4
+ "modality_configs": {
5
+ "real_g1_relative_eef_relative_joints": {
6
+ "video": {
7
+ "delta_indices": [
8
+ -20,
9
+ 0
10
+ ],
11
+ "modality_keys": [
12
+ "ego_view"
13
+ ],
14
+ "sin_cos_embedding_keys": null,
15
+ "mean_std_embedding_keys": null,
16
+ "action_configs": null
17
+ },
18
+ "state": {
19
+ "delta_indices": [
20
+ 0
21
+ ],
22
+ "modality_keys": [
23
+ "left_wrist_eef_9d",
24
+ "right_wrist_eef_9d",
25
+ "left_hand",
26
+ "right_hand",
27
+ "left_arm",
28
+ "right_arm",
29
+ "waist"
30
+ ],
31
+ "sin_cos_embedding_keys": null,
32
+ "mean_std_embedding_keys": null,
33
+ "action_configs": null
34
+ },
35
+ "action": {
36
+ "delta_indices": [
37
+ 0,
38
+ 1,
39
+ 2,
40
+ 3,
41
+ 4,
42
+ 5,
43
+ 6,
44
+ 7,
45
+ 8,
46
+ 9,
47
+ 10,
48
+ 11,
49
+ 12,
50
+ 13,
51
+ 14,
52
+ 15,
53
+ 16,
54
+ 17,
55
+ 18,
56
+ 19,
57
+ 20,
58
+ 21,
59
+ 22,
60
+ 23,
61
+ 24,
62
+ 25,
63
+ 26,
64
+ 27,
65
+ 28,
66
+ 29,
67
+ 30,
68
+ 31,
69
+ 32,
70
+ 33,
71
+ 34,
72
+ 35,
73
+ 36,
74
+ 37,
75
+ 38,
76
+ 39
77
+ ],
78
+ "modality_keys": [
79
+ "left_wrist_eef_9d",
80
+ "right_wrist_eef_9d",
81
+ "left_hand",
82
+ "right_hand",
83
+ "left_arm",
84
+ "right_arm",
85
+ "waist",
86
+ "base_height_command",
87
+ "navigate_command"
88
+ ],
89
+ "sin_cos_embedding_keys": null,
90
+ "mean_std_embedding_keys": null,
91
+ "action_configs": [
92
+ {
93
+ "rep": "RELATIVE",
94
+ "type": "EEF",
95
+ "format": "XYZ_ROT6D",
96
+ "state_key": "left_wrist_eef_9d"
97
+ },
98
+ {
99
+ "rep": "RELATIVE",
100
+ "type": "EEF",
101
+ "format": "XYZ_ROT6D",
102
+ "state_key": "right_wrist_eef_9d"
103
+ },
104
+ {
105
+ "rep": "ABSOLUTE",
106
+ "type": "NON_EEF",
107
+ "format": "DEFAULT",
108
+ "state_key": "left_hand"
109
+ },
110
+ {
111
+ "rep": "ABSOLUTE",
112
+ "type": "NON_EEF",
113
+ "format": "DEFAULT",
114
+ "state_key": "right_hand"
115
+ },
116
+ {
117
+ "rep": "RELATIVE",
118
+ "type": "NON_EEF",
119
+ "format": "DEFAULT",
120
+ "state_key": "left_arm"
121
+ },
122
+ {
123
+ "rep": "RELATIVE",
124
+ "type": "NON_EEF",
125
+ "format": "DEFAULT",
126
+ "state_key": "right_arm"
127
+ },
128
+ {
129
+ "rep": "ABSOLUTE",
130
+ "type": "NON_EEF",
131
+ "format": "DEFAULT",
132
+ "state_key": "waist"
133
+ },
134
+ {
135
+ "rep": "ABSOLUTE",
136
+ "type": "NON_EEF",
137
+ "format": "DEFAULT",
138
+ "state_key": "base_height_command"
139
+ },
140
+ {
141
+ "rep": "ABSOLUTE",
142
+ "type": "NON_EEF",
143
+ "format": "DEFAULT",
144
+ "state_key": "navigate_command"
145
+ }
146
+ ]
147
+ },
148
+ "language": {
149
+ "delta_indices": [
150
+ 0
151
+ ],
152
+ "modality_keys": [
153
+ "annotation.human.task_description"
154
+ ],
155
+ "sin_cos_embedding_keys": null,
156
+ "mean_std_embedding_keys": null,
157
+ "action_configs": null
158
+ }
159
+ },
160
+ "real_r1_pro_sharpa_relative_eef_mecka": {
161
+ "video": {
162
+ "delta_indices": [
163
+ -30,
164
+ 0
165
+ ],
166
+ "modality_keys": [
167
+ "ego_view_cropratio_res320x240_freq30"
168
+ ],
169
+ "sin_cos_embedding_keys": null,
170
+ "mean_std_embedding_keys": null,
171
+ "action_configs": null
172
+ },
173
+ "state": {
174
+ "delta_indices": [
175
+ 0
176
+ ],
177
+ "modality_keys": [
178
+ "left_wrist_eef",
179
+ "right_wrist_eef",
180
+ "left_hand_joints",
181
+ "right_hand_joints"
182
+ ],
183
+ "sin_cos_embedding_keys": null,
184
+ "mean_std_embedding_keys": null,
185
+ "action_configs": null
186
+ },
187
+ "action": {
188
+ "delta_indices": [
189
+ 0,
190
+ 1,
191
+ 2,
192
+ 3,
193
+ 4,
194
+ 5,
195
+ 6,
196
+ 7,
197
+ 8,
198
+ 9,
199
+ 10,
200
+ 11,
201
+ 12,
202
+ 13,
203
+ 14,
204
+ 15,
205
+ 16,
206
+ 17,
207
+ 18,
208
+ 19,
209
+ 20,
210
+ 21,
211
+ 22,
212
+ 23,
213
+ 24,
214
+ 25,
215
+ 26,
216
+ 27,
217
+ 28,
218
+ 29,
219
+ 30,
220
+ 31,
221
+ 32,
222
+ 33,
223
+ 34,
224
+ 35,
225
+ 36,
226
+ 37,
227
+ 38,
228
+ 39
229
+ ],
230
+ "modality_keys": [
231
+ "left_wrist_eef",
232
+ "right_wrist_eef",
233
+ "left_hand_joints",
234
+ "right_hand_joints"
235
+ ],
236
+ "sin_cos_embedding_keys": null,
237
+ "mean_std_embedding_keys": null,
238
+ "action_configs": [
239
+ {
240
+ "rep": "RELATIVE",
241
+ "type": "EEF",
242
+ "format": "XYZ_ROT6D",
243
+ "state_key": "left_wrist_eef"
244
+ },
245
+ {
246
+ "rep": "RELATIVE",
247
+ "type": "EEF",
248
+ "format": "XYZ_ROT6D",
249
+ "state_key": "right_wrist_eef"
250
+ },
251
+ {
252
+ "rep": "ABSOLUTE",
253
+ "type": "NON_EEF",
254
+ "format": "DEFAULT",
255
+ "state_key": "left_hand_joints"
256
+ },
257
+ {
258
+ "rep": "ABSOLUTE",
259
+ "type": "NON_EEF",
260
+ "format": "DEFAULT",
261
+ "state_key": "right_hand_joints"
262
+ }
263
+ ]
264
+ },
265
+ "language": {
266
+ "delta_indices": [
267
+ 0
268
+ ],
269
+ "modality_keys": [
270
+ "annotation.human.coarse_action"
271
+ ],
272
+ "sin_cos_embedding_keys": null,
273
+ "mean_std_embedding_keys": null,
274
+ "action_configs": null
275
+ }
276
+ },
277
+ "real_r1_pro_sharpa_relative_eef_human": {
278
+ "video": {
279
+ "delta_indices": [
280
+ -20,
281
+ 0
282
+ ],
283
+ "modality_keys": [
284
+ "ego_view_res320x240_freq20",
285
+ "left_wrist_view_res320x240_freq20",
286
+ "right_wrist_view_res320x240_freq20"
287
+ ],
288
+ "sin_cos_embedding_keys": null,
289
+ "mean_std_embedding_keys": null,
290
+ "action_configs": null
291
+ },
292
+ "state": {
293
+ "delta_indices": [
294
+ 0
295
+ ],
296
+ "modality_keys": [
297
+ "left_wrist_eef",
298
+ "right_wrist_eef",
299
+ "left_hand_joints",
300
+ "right_hand_joints"
301
+ ],
302
+ "sin_cos_embedding_keys": null,
303
+ "mean_std_embedding_keys": null,
304
+ "action_configs": null
305
+ },
306
+ "action": {
307
+ "delta_indices": [
308
+ 0,
309
+ 1,
310
+ 2,
311
+ 3,
312
+ 4,
313
+ 5,
314
+ 6,
315
+ 7,
316
+ 8,
317
+ 9,
318
+ 10,
319
+ 11,
320
+ 12,
321
+ 13,
322
+ 14,
323
+ 15,
324
+ 16,
325
+ 17,
326
+ 18,
327
+ 19,
328
+ 20,
329
+ 21,
330
+ 22,
331
+ 23,
332
+ 24,
333
+ 25,
334
+ 26,
335
+ 27,
336
+ 28,
337
+ 29,
338
+ 30,
339
+ 31,
340
+ 32,
341
+ 33,
342
+ 34,
343
+ 35,
344
+ 36,
345
+ 37,
346
+ 38,
347
+ 39
348
+ ],
349
+ "modality_keys": [
350
+ "left_wrist_eef",
351
+ "right_wrist_eef",
352
+ "left_hand_joints",
353
+ "right_hand_joints"
354
+ ],
355
+ "sin_cos_embedding_keys": null,
356
+ "mean_std_embedding_keys": null,
357
+ "action_configs": [
358
+ {
359
+ "rep": "RELATIVE",
360
+ "type": "EEF",
361
+ "format": "XYZ_ROT6D",
362
+ "state_key": "left_wrist_eef"
363
+ },
364
+ {
365
+ "rep": "RELATIVE",
366
+ "type": "EEF",
367
+ "format": "XYZ_ROT6D",
368
+ "state_key": "right_wrist_eef"
369
+ },
370
+ {
371
+ "rep": "ABSOLUTE",
372
+ "type": "NON_EEF",
373
+ "format": "DEFAULT",
374
+ "state_key": "left_hand_joints"
375
+ },
376
+ {
377
+ "rep": "ABSOLUTE",
378
+ "type": "NON_EEF",
379
+ "format": "DEFAULT",
380
+ "state_key": "right_hand_joints"
381
+ }
382
+ ]
383
+ },
384
+ "language": {
385
+ "delta_indices": [
386
+ 0
387
+ ],
388
+ "modality_keys": [
389
+ "annotation.human.coarse_action"
390
+ ],
391
+ "sin_cos_embedding_keys": null,
392
+ "mean_std_embedding_keys": null,
393
+ "action_configs": null
394
+ }
395
+ },
396
+ "real_r1_pro_sharpa_relative_eef": {
397
+ "video": {
398
+ "delta_indices": [
399
+ -20,
400
+ 0
401
+ ],
402
+ "modality_keys": [
403
+ "ego_view_res320x240_freq20",
404
+ "left_wrist_view_res320x240_freq20",
405
+ "right_wrist_view_res320x240_freq20"
406
+ ],
407
+ "sin_cos_embedding_keys": null,
408
+ "mean_std_embedding_keys": null,
409
+ "action_configs": null
410
+ },
411
+ "state": {
412
+ "delta_indices": [
413
+ 0
414
+ ],
415
+ "modality_keys": [
416
+ "left_wrist_eef",
417
+ "right_wrist_eef",
418
+ "left_hand_joints",
419
+ "right_hand_joints"
420
+ ],
421
+ "sin_cos_embedding_keys": null,
422
+ "mean_std_embedding_keys": null,
423
+ "action_configs": null
424
+ },
425
+ "action": {
426
+ "delta_indices": [
427
+ 0,
428
+ 1,
429
+ 2,
430
+ 3,
431
+ 4,
432
+ 5,
433
+ 6,
434
+ 7,
435
+ 8,
436
+ 9,
437
+ 10,
438
+ 11,
439
+ 12,
440
+ 13,
441
+ 14,
442
+ 15,
443
+ 16,
444
+ 17,
445
+ 18,
446
+ 19,
447
+ 20,
448
+ 21,
449
+ 22,
450
+ 23,
451
+ 24,
452
+ 25,
453
+ 26,
454
+ 27,
455
+ 28,
456
+ 29,
457
+ 30,
458
+ 31,
459
+ 32,
460
+ 33,
461
+ 34,
462
+ 35,
463
+ 36,
464
+ 37,
465
+ 38,
466
+ 39
467
+ ],
468
+ "modality_keys": [
469
+ "left_wrist_eef",
470
+ "right_wrist_eef",
471
+ "left_hand_joints",
472
+ "right_hand_joints"
473
+ ],
474
+ "sin_cos_embedding_keys": null,
475
+ "mean_std_embedding_keys": null,
476
+ "action_configs": [
477
+ {
478
+ "rep": "RELATIVE",
479
+ "type": "EEF",
480
+ "format": "XYZ_ROT6D",
481
+ "state_key": "left_wrist_eef"
482
+ },
483
+ {
484
+ "rep": "RELATIVE",
485
+ "type": "EEF",
486
+ "format": "XYZ_ROT6D",
487
+ "state_key": "right_wrist_eef"
488
+ },
489
+ {
490
+ "rep": "ABSOLUTE",
491
+ "type": "NON_EEF",
492
+ "format": "DEFAULT",
493
+ "state_key": "left_hand_joints"
494
+ },
495
+ {
496
+ "rep": "ABSOLUTE",
497
+ "type": "NON_EEF",
498
+ "format": "DEFAULT",
499
+ "state_key": "right_hand_joints"
500
+ }
501
+ ]
502
+ },
503
+ "language": {
504
+ "delta_indices": [
505
+ 0
506
+ ],
507
+ "modality_keys": [
508
+ "annotation.human.coarse_action"
509
+ ],
510
+ "sin_cos_embedding_keys": null,
511
+ "mean_std_embedding_keys": null,
512
+ "action_configs": null
513
+ }
514
+ },
515
+ "xdof_relative_eef_relative_joint": {
516
+ "video": {
517
+ "delta_indices": [
518
+ -30,
519
+ 0
520
+ ],
521
+ "modality_keys": [
522
+ "top_camera-images-rgb_320_240",
523
+ "left_camera-images-rgb_320_240",
524
+ "right_camera-images-rgb_320_240"
525
+ ],
526
+ "sin_cos_embedding_keys": null,
527
+ "mean_std_embedding_keys": null,
528
+ "action_configs": null
529
+ },
530
+ "state": {
531
+ "delta_indices": [
532
+ 0
533
+ ],
534
+ "modality_keys": [
535
+ "left_wrist_eef",
536
+ "right_wrist_eef",
537
+ "left_gripper_pos",
538
+ "right_gripper_pos",
539
+ "left_joint_pos",
540
+ "right_joint_pos"
541
+ ],
542
+ "sin_cos_embedding_keys": null,
543
+ "mean_std_embedding_keys": null,
544
+ "action_configs": null
545
+ },
546
+ "action": {
547
+ "delta_indices": [
548
+ 0,
549
+ 1,
550
+ 2,
551
+ 3,
552
+ 4,
553
+ 5,
554
+ 6,
555
+ 7,
556
+ 8,
557
+ 9,
558
+ 10,
559
+ 11,
560
+ 12,
561
+ 13,
562
+ 14,
563
+ 15,
564
+ 16,
565
+ 17,
566
+ 18,
567
+ 19,
568
+ 20,
569
+ 21,
570
+ 22,
571
+ 23,
572
+ 24,
573
+ 25,
574
+ 26,
575
+ 27,
576
+ 28,
577
+ 29,
578
+ 30,
579
+ 31,
580
+ 32,
581
+ 33,
582
+ 34,
583
+ 35,
584
+ 36,
585
+ 37,
586
+ 38,
587
+ 39
588
+ ],
589
+ "modality_keys": [
590
+ "left_wrist_eef",
591
+ "right_wrist_eef",
592
+ "left_gripper_pos",
593
+ "right_gripper_pos",
594
+ "left_joint_pos",
595
+ "right_joint_pos"
596
+ ],
597
+ "sin_cos_embedding_keys": null,
598
+ "mean_std_embedding_keys": null,
599
+ "action_configs": [
600
+ {
601
+ "rep": "RELATIVE",
602
+ "type": "EEF",
603
+ "format": "XYZ_ROT6D",
604
+ "state_key": "left_wrist_eef"
605
+ },
606
+ {
607
+ "rep": "RELATIVE",
608
+ "type": "EEF",
609
+ "format": "XYZ_ROT6D",
610
+ "state_key": "right_wrist_eef"
611
+ },
612
+ {
613
+ "rep": "ABSOLUTE",
614
+ "type": "NON_EEF",
615
+ "format": "DEFAULT",
616
+ "state_key": "left_gripper_pos"
617
+ },
618
+ {
619
+ "rep": "ABSOLUTE",
620
+ "type": "NON_EEF",
621
+ "format": "DEFAULT",
622
+ "state_key": "right_gripper_pos"
623
+ },
624
+ {
625
+ "rep": "RELATIVE",
626
+ "type": "NON_EEF",
627
+ "format": "DEFAULT",
628
+ "state_key": "left_joint_pos"
629
+ },
630
+ {
631
+ "rep": "RELATIVE",
632
+ "type": "NON_EEF",
633
+ "format": "DEFAULT",
634
+ "state_key": "right_joint_pos"
635
+ }
636
+ ]
637
+ },
638
+ "language": {
639
+ "delta_indices": [
640
+ 0
641
+ ],
642
+ "modality_keys": [
643
+ "annotation.task"
644
+ ],
645
+ "sin_cos_embedding_keys": null,
646
+ "mean_std_embedding_keys": null,
647
+ "action_configs": null
648
+ }
649
+ },
650
+ "real_r1_pro_sharpa_relative_eef_maxinsights": {
651
+ "video": {
652
+ "delta_indices": [
653
+ -30,
654
+ 0
655
+ ],
656
+ "modality_keys": [
657
+ "ego_view_cropratio_res320x240_freq30"
658
+ ],
659
+ "sin_cos_embedding_keys": null,
660
+ "mean_std_embedding_keys": null,
661
+ "action_configs": null
662
+ },
663
+ "state": {
664
+ "delta_indices": [
665
+ 0
666
+ ],
667
+ "modality_keys": [
668
+ "left_wrist_eef",
669
+ "right_wrist_eef",
670
+ "left_hand_joints",
671
+ "right_hand_joints"
672
+ ],
673
+ "sin_cos_embedding_keys": null,
674
+ "mean_std_embedding_keys": null,
675
+ "action_configs": null
676
+ },
677
+ "action": {
678
+ "delta_indices": [
679
+ 0,
680
+ 1,
681
+ 2,
682
+ 3,
683
+ 4,
684
+ 5,
685
+ 6,
686
+ 7,
687
+ 8,
688
+ 9,
689
+ 10,
690
+ 11,
691
+ 12,
692
+ 13,
693
+ 14,
694
+ 15,
695
+ 16,
696
+ 17,
697
+ 18,
698
+ 19,
699
+ 20,
700
+ 21,
701
+ 22,
702
+ 23,
703
+ 24,
704
+ 25,
705
+ 26,
706
+ 27,
707
+ 28,
708
+ 29,
709
+ 30,
710
+ 31,
711
+ 32,
712
+ 33,
713
+ 34,
714
+ 35,
715
+ 36,
716
+ 37,
717
+ 38,
718
+ 39
719
+ ],
720
+ "modality_keys": [
721
+ "left_wrist_eef",
722
+ "right_wrist_eef",
723
+ "left_hand_joints",
724
+ "right_hand_joints"
725
+ ],
726
+ "sin_cos_embedding_keys": null,
727
+ "mean_std_embedding_keys": null,
728
+ "action_configs": [
729
+ {
730
+ "rep": "RELATIVE",
731
+ "type": "EEF",
732
+ "format": "XYZ_ROT6D",
733
+ "state_key": "left_wrist_eef"
734
+ },
735
+ {
736
+ "rep": "RELATIVE",
737
+ "type": "EEF",
738
+ "format": "XYZ_ROT6D",
739
+ "state_key": "right_wrist_eef"
740
+ },
741
+ {
742
+ "rep": "ABSOLUTE",
743
+ "type": "NON_EEF",
744
+ "format": "DEFAULT",
745
+ "state_key": "left_hand_joints"
746
+ },
747
+ {
748
+ "rep": "ABSOLUTE",
749
+ "type": "NON_EEF",
750
+ "format": "DEFAULT",
751
+ "state_key": "right_hand_joints"
752
+ }
753
+ ]
754
+ },
755
+ "language": {
756
+ "delta_indices": [
757
+ 0
758
+ ],
759
+ "modality_keys": [
760
+ "annotation.human.coarse_action"
761
+ ],
762
+ "sin_cos_embedding_keys": null,
763
+ "mean_std_embedding_keys": null,
764
+ "action_configs": null
765
+ }
766
+ },
767
+ "xdof_relative_eef_relative_joint_subtask": {
768
+ "video": {
769
+ "delta_indices": [
770
+ -30,
771
+ 0
772
+ ],
773
+ "modality_keys": [
774
+ "top_camera-images-rgb_320_240",
775
+ "left_camera-images-rgb_320_240",
776
+ "right_camera-images-rgb_320_240"
777
+ ],
778
+ "sin_cos_embedding_keys": null,
779
+ "mean_std_embedding_keys": null,
780
+ "action_configs": null
781
+ },
782
+ "state": {
783
+ "delta_indices": [
784
+ 0
785
+ ],
786
+ "modality_keys": [
787
+ "left_wrist_eef",
788
+ "right_wrist_eef",
789
+ "left_gripper_pos",
790
+ "right_gripper_pos",
791
+ "left_joint_pos",
792
+ "right_joint_pos"
793
+ ],
794
+ "sin_cos_embedding_keys": null,
795
+ "mean_std_embedding_keys": null,
796
+ "action_configs": null
797
+ },
798
+ "action": {
799
+ "delta_indices": [
800
+ 0,
801
+ 1,
802
+ 2,
803
+ 3,
804
+ 4,
805
+ 5,
806
+ 6,
807
+ 7,
808
+ 8,
809
+ 9,
810
+ 10,
811
+ 11,
812
+ 12,
813
+ 13,
814
+ 14,
815
+ 15,
816
+ 16,
817
+ 17,
818
+ 18,
819
+ 19,
820
+ 20,
821
+ 21,
822
+ 22,
823
+ 23,
824
+ 24,
825
+ 25,
826
+ 26,
827
+ 27,
828
+ 28,
829
+ 29,
830
+ 30,
831
+ 31,
832
+ 32,
833
+ 33,
834
+ 34,
835
+ 35,
836
+ 36,
837
+ 37,
838
+ 38,
839
+ 39
840
+ ],
841
+ "modality_keys": [
842
+ "left_wrist_eef",
843
+ "right_wrist_eef",
844
+ "left_gripper_pos",
845
+ "right_gripper_pos",
846
+ "left_joint_pos",
847
+ "right_joint_pos"
848
+ ],
849
+ "sin_cos_embedding_keys": null,
850
+ "mean_std_embedding_keys": null,
851
+ "action_configs": [
852
+ {
853
+ "rep": "RELATIVE",
854
+ "type": "EEF",
855
+ "format": "XYZ_ROT6D",
856
+ "state_key": "left_wrist_eef"
857
+ },
858
+ {
859
+ "rep": "RELATIVE",
860
+ "type": "EEF",
861
+ "format": "XYZ_ROT6D",
862
+ "state_key": "right_wrist_eef"
863
+ },
864
+ {
865
+ "rep": "ABSOLUTE",
866
+ "type": "NON_EEF",
867
+ "format": "DEFAULT",
868
+ "state_key": "left_gripper_pos"
869
+ },
870
+ {
871
+ "rep": "ABSOLUTE",
872
+ "type": "NON_EEF",
873
+ "format": "DEFAULT",
874
+ "state_key": "right_gripper_pos"
875
+ },
876
+ {
877
+ "rep": "RELATIVE",
878
+ "type": "NON_EEF",
879
+ "format": "DEFAULT",
880
+ "state_key": "left_joint_pos"
881
+ },
882
+ {
883
+ "rep": "RELATIVE",
884
+ "type": "NON_EEF",
885
+ "format": "DEFAULT",
886
+ "state_key": "right_joint_pos"
887
+ }
888
+ ]
889
+ },
890
+ "language": {
891
+ "delta_indices": [
892
+ 0
893
+ ],
894
+ "modality_keys": [
895
+ "annotation.sub_task"
896
+ ],
897
+ "sin_cos_embedding_keys": null,
898
+ "mean_std_embedding_keys": null,
899
+ "action_configs": null
900
+ }
901
+ },
902
+ "oxe_droid_relative_eef_relative_joint": {
903
+ "video": {
904
+ "delta_indices": [
905
+ -15,
906
+ 0
907
+ ],
908
+ "modality_keys": [
909
+ "exterior_image_1_left",
910
+ "wrist_image_left"
911
+ ],
912
+ "sin_cos_embedding_keys": null,
913
+ "mean_std_embedding_keys": null,
914
+ "action_configs": null
915
+ },
916
+ "state": {
917
+ "delta_indices": [
918
+ 0
919
+ ],
920
+ "modality_keys": [
921
+ "eef_9d",
922
+ "gripper_position",
923
+ "joint_position"
924
+ ],
925
+ "sin_cos_embedding_keys": null,
926
+ "mean_std_embedding_keys": null,
927
+ "action_configs": null
928
+ },
929
+ "action": {
930
+ "delta_indices": [
931
+ 0,
932
+ 1,
933
+ 2,
934
+ 3,
935
+ 4,
936
+ 5,
937
+ 6,
938
+ 7,
939
+ 8,
940
+ 9,
941
+ 10,
942
+ 11,
943
+ 12,
944
+ 13,
945
+ 14,
946
+ 15,
947
+ 16,
948
+ 17,
949
+ 18,
950
+ 19,
951
+ 20,
952
+ 21,
953
+ 22,
954
+ 23,
955
+ 24,
956
+ 25,
957
+ 26,
958
+ 27,
959
+ 28,
960
+ 29,
961
+ 30,
962
+ 31,
963
+ 32,
964
+ 33,
965
+ 34,
966
+ 35,
967
+ 36,
968
+ 37,
969
+ 38,
970
+ 39
971
+ ],
972
+ "modality_keys": [
973
+ "eef_9d",
974
+ "gripper_position",
975
+ "joint_position"
976
+ ],
977
+ "sin_cos_embedding_keys": null,
978
+ "mean_std_embedding_keys": null,
979
+ "action_configs": [
980
+ {
981
+ "rep": "RELATIVE",
982
+ "type": "EEF",
983
+ "format": "XYZ_ROT6D",
984
+ "state_key": "eef_9d"
985
+ },
986
+ {
987
+ "rep": "ABSOLUTE",
988
+ "type": "NON_EEF",
989
+ "format": "DEFAULT",
990
+ "state_key": "gripper_position"
991
+ },
992
+ {
993
+ "rep": "RELATIVE",
994
+ "type": "NON_EEF",
995
+ "format": "DEFAULT",
996
+ "state_key": "joint_position"
997
+ }
998
+ ]
999
+ },
1000
+ "language": {
1001
+ "delta_indices": [
1002
+ 0
1003
+ ],
1004
+ "modality_keys": [
1005
+ "annotation.language.language_instruction"
1006
+ ],
1007
+ "sin_cos_embedding_keys": null,
1008
+ "mean_std_embedding_keys": null,
1009
+ "action_configs": null
1010
+ }
1011
+ },
1012
+ "new_embodiment": {
1013
+ "video": {
1014
+ "delta_indices": [
1015
+ 0
1016
+ ],
1017
+ "modality_keys": [
1018
+ "cam_left_head",
1019
+ "cam_left_wrist",
1020
+ "cam_right_wrist"
1021
+ ],
1022
+ "sin_cos_embedding_keys": null,
1023
+ "mean_std_embedding_keys": null,
1024
+ "action_configs": null
1025
+ },
1026
+ "state": {
1027
+ "delta_indices": [
1028
+ 0
1029
+ ],
1030
+ "modality_keys": [
1031
+ "arm_left",
1032
+ "arm_right",
1033
+ "odometry"
1034
+ ],
1035
+ "sin_cos_embedding_keys": null,
1036
+ "mean_std_embedding_keys": null,
1037
+ "action_configs": null
1038
+ },
1039
+ "action": {
1040
+ "delta_indices": [
1041
+ 0,
1042
+ 1,
1043
+ 2,
1044
+ 3,
1045
+ 4,
1046
+ 5,
1047
+ 6,
1048
+ 7,
1049
+ 8,
1050
+ 9,
1051
+ 10,
1052
+ 11,
1053
+ 12,
1054
+ 13,
1055
+ 14,
1056
+ 15
1057
+ ],
1058
+ "modality_keys": [
1059
+ "arm_left",
1060
+ "arm_right",
1061
+ "odometry"
1062
+ ],
1063
+ "sin_cos_embedding_keys": null,
1064
+ "mean_std_embedding_keys": null,
1065
+ "action_configs": [
1066
+ {
1067
+ "rep": "ABSOLUTE",
1068
+ "type": "NON_EEF",
1069
+ "format": "DEFAULT",
1070
+ "state_key": null
1071
+ },
1072
+ {
1073
+ "rep": "ABSOLUTE",
1074
+ "type": "NON_EEF",
1075
+ "format": "DEFAULT",
1076
+ "state_key": null
1077
+ },
1078
+ {
1079
+ "rep": "ABSOLUTE",
1080
+ "type": "NON_EEF",
1081
+ "format": "DEFAULT",
1082
+ "state_key": null
1083
+ }
1084
+ ]
1085
+ },
1086
+ "language": {
1087
+ "delta_indices": [
1088
+ 0
1089
+ ],
1090
+ "modality_keys": [
1091
+ "annotation.human.task_description"
1092
+ ],
1093
+ "sin_cos_embedding_keys": null,
1094
+ "mean_std_embedding_keys": null,
1095
+ "action_configs": null
1096
+ }
1097
+ }
1098
+ },
1099
+ "image_crop_size": [
1100
+ 230,
1101
+ 230
1102
+ ],
1103
+ "image_target_size": [
1104
+ 256,
1105
+ 256
1106
+ ],
1107
+ "use_albumentations": true,
1108
+ "random_rotation_angle": 0,
1109
+ "color_jitter_params": {
1110
+ "brightness": 0.3,
1111
+ "contrast": 0.4,
1112
+ "saturation": 0.5,
1113
+ "hue": 0.08
1114
+ },
1115
+ "shortest_image_edge": 256,
1116
+ "crop_fraction": 0.95,
1117
+ "letter_box_transform": false,
1118
+ "model_name": "nvidia/Cosmos-Reason2-2B",
1119
+ "model_type": "qwen",
1120
+ "formalize_language": true,
1121
+ "max_state_dim": 132,
1122
+ "max_action_dim": 132,
1123
+ "max_action_horizon": 40,
1124
+ "use_percentiles": false,
1125
+ "use_mean_std": false,
1126
+ "clip_outliers": true,
1127
+ "apply_sincos_state_encoding": false,
1128
+ "use_relative_action": true,
1129
+ "exclude_state": false,
1130
+ "state_dropout_prob": 0.2
1131
+ }
1132
+ }
*.pth/statistics.json ADDED
The diff for this file is too large to render. See raw diff
 
*.pth/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
*.pth/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb246b0cdd309f00fa6859feca6c4a2ff98aa9e477a9a92cab2526b1e53b864e
3
+ size 6161
*.pth/wandb_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"project": "finetune-gr00t-n1d7", "run_id": "task0025_subtask_lang_final_layer_vlm_threecam_minmax_80k_gpu1_20260618_045240"}