mingxinz committed on
Commit
7ab4492
·
verified ·
1 Parent(s): 833b08c

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_horizon": 50,
3
+ "add_pos_embed": true,
4
+ "apply_sincos_state_encoding": true,
5
+ "architectures": [
6
+ "Gr00tN1d6"
7
+ ],
8
+ "attn_dropout": 0.2,
9
+ "attn_implementation": null,
10
+ "backbone_embedding_dim": 2048,
11
+ "backbone_model_type": "eagle",
12
+ "backbone_trainable_params_fp32": true,
13
+ "collator_overwrite_image_inputs": false,
14
+ "color_jitter_params": {
15
+ "brightness": 0.1,
16
+ "contrast": 0.1,
17
+ "hue": 0.1,
18
+ "saturation": 0.1
19
+ },
20
+ "crop_fraction": 0.95,
21
+ "diffusion_model_cfg": {
22
+ "attention_head_dim": 48,
23
+ "dropout": 0.2,
24
+ "final_dropout": true,
25
+ "interleave_self_attention": true,
26
+ "norm_type": "ada_norm",
27
+ "num_attention_heads": 32,
28
+ "num_layers": 32,
29
+ "output_dim": 1024,
30
+ "positional_embeddings": null
31
+ },
32
+ "eagle_collator": true,
33
+ "formalize_language": true,
34
+ "gemma_collator": false,
35
+ "hidden_size": 1024,
36
+ "image_crop_size": null,
37
+ "image_target_size": null,
38
+ "input_embedding_dim": 1536,
39
+ "load_bf16": true,
40
+ "max_action_dim": 128,
41
+ "max_num_embodiments": 32,
42
+ "max_seq_len": 1024,
43
+ "max_state_dim": 128,
44
+ "model_dtype": "bfloat16",
45
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
46
+ "model_type": "Gr00tN1d6",
47
+ "noise_beta_alpha": 1.5,
48
+ "noise_beta_beta": 1.0,
49
+ "noise_s": 0.999,
50
+ "num_inference_timesteps": 4,
51
+ "num_timestep_buckets": 1000,
52
+ "random_rotation_angle": null,
53
+ "reproject_vision": false,
54
+ "select_layer": 16,
55
+ "shortest_image_edge": 256,
56
+ "state_dropout_prob": 0.0,
57
+ "torch_dtype": "float32",
58
+ "transformers_version": "4.51.3",
59
+ "tune_diffusion_model": true,
60
+ "tune_llm": false,
61
+ "tune_projector": true,
62
+ "tune_top_llm_layers": 4,
63
+ "tune_visual": true,
64
+ "tune_vlln": true,
65
+ "use_albumentations_transforms": true,
66
+ "use_alternate_vl_dit": true,
67
+ "use_flash_attention": true,
68
+ "use_relative_action": true,
69
+ "use_vlln": true
70
+ }
embodiment_id.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "robocasa_panda_omron": 13,
3
+ "gr1": 20,
4
+ "behavior_r1_pro": 24,
5
+ "unitree_g1": 8,
6
+ "oxe_google": 0,
7
+ "oxe_widowx": 1,
8
+ "libero_panda": 2,
9
+ "new_embodiment": 10
10
+ }
experiment_cfg/conf.yaml ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ load_config_path: null
2
+ model:
3
+ model_type: Gr00tN1d6
4
+ model_dtype: bfloat16
5
+ model_name: nvidia/Eagle-Block2A-2B-v2
6
+ backbone_model_type: eagle
7
+ model_revision: null
8
+ tune_top_llm_layers: 4
9
+ backbone_embedding_dim: 2048
10
+ tune_llm: false
11
+ tune_visual: true
12
+ select_layer: 16
13
+ reproject_vision: false
14
+ use_flash_attention: true
15
+ load_bf16: false
16
+ collator_overwrite_image_inputs: false
17
+ eagle_collator: true
18
+ backbone_trainable_params_fp32: true
19
+ image_crop_size: null
20
+ image_target_size: null
21
+ shortest_image_edge: 256
22
+ crop_fraction: 0.95
23
+ random_rotation_angle: null
24
+ color_jitter_params:
25
+ brightness: 0.3
26
+ contrast: 0.4
27
+ saturation: 0.5
28
+ hue: 0.08
29
+ use_albumentations_transforms: true
30
+ formalize_language: true
31
+ apply_sincos_state_encoding: false
32
+ use_relative_action: true
33
+ max_state_dim: 29
34
+ max_action_dim: 29
35
+ action_horizon: 16
36
+ hidden_size: 1024
37
+ input_embedding_dim: 1536
38
+ add_pos_embed: true
39
+ attn_dropout: 0.2
40
+ use_vlln: true
41
+ max_seq_len: 1024
42
+ use_alternate_vl_dit: true
43
+ attend_text_every_n_blocks: 2
44
+ diffusion_model_cfg:
45
+ positional_embeddings: null
46
+ num_layers: 32
47
+ num_attention_heads: 32
48
+ attention_head_dim: 48
49
+ norm_type: ada_norm
50
+ dropout: 0.2
51
+ final_dropout: true
52
+ output_dim: 1024
53
+ interleave_self_attention: true
54
+ num_inference_timesteps: 4
55
+ noise_beta_alpha: 1.5
56
+ noise_beta_beta: 1.0
57
+ noise_s: 0.999
58
+ num_timestep_buckets: 1000
59
+ tune_projector: true
60
+ tune_diffusion_model: true
61
+ tune_vlln: true
62
+ state_dropout_prob: 0.0
63
+ state_additive_noise_scale: 0.0
64
+ max_num_embodiments: 32
65
+ data:
66
+ datasets:
67
+ - dataset_paths:
68
+ - /datasets/orca-sim-pick-and-place-mimic/stage1_3_cosmos/lerobot
69
+ - /datasets/orca-sim-pick-and-place-mimic/stage1_5_cosmos/lerobot
70
+ - /datasets/orca-sim-pick-and-place-mimic/stage1_7_cosmos/lerobot
71
+ - /datasets/orca-sim-pick-and-place-mimic/stage1_8_cosmos/lerobot
72
+ embodiment_tag: new_embodiment
73
+ mix_ratio: 1.0
74
+ dataset_type: physical_embodiment
75
+ val_dataset_path: null
76
+ modality_configs:
77
+ new_embodiment:
78
+ video:
79
+ delta_indices:
80
+ - 0
81
+ modality_keys:
82
+ - ego_view
83
+ sin_cos_embedding_keys: null
84
+ mean_std_embedding_keys: null
85
+ action_configs: null
86
+ state:
87
+ delta_indices:
88
+ - 0
89
+ modality_keys:
90
+ - left_arm
91
+ - right_arm
92
+ - left_hand
93
+ - right_hand
94
+ - waist
95
+ sin_cos_embedding_keys: null
96
+ mean_std_embedding_keys: null
97
+ action_configs: null
98
+ action:
99
+ delta_indices:
100
+ - 0
101
+ - 1
102
+ - 2
103
+ - 3
104
+ - 4
105
+ - 5
106
+ - 6
107
+ - 7
108
+ - 8
109
+ - 9
110
+ - 10
111
+ - 11
112
+ - 12
113
+ - 13
114
+ - 14
115
+ - 15
116
+ modality_keys:
117
+ - left_arm
118
+ - right_arm
119
+ - left_hand
120
+ - right_hand
121
+ - base_height_command
122
+ - navigate_command
123
+ sin_cos_embedding_keys: null
124
+ mean_std_embedding_keys: null
125
+ action_configs:
126
+ - rep: ABSOLUTE
127
+ type: NON_EEF
128
+ format: DEFAULT
129
+ state_key: null
130
+ - rep: ABSOLUTE
131
+ type: NON_EEF
132
+ format: DEFAULT
133
+ state_key: null
134
+ - rep: ABSOLUTE
135
+ type: NON_EEF
136
+ format: DEFAULT
137
+ state_key: null
138
+ - rep: ABSOLUTE
139
+ type: NON_EEF
140
+ format: DEFAULT
141
+ state_key: null
142
+ - rep: ABSOLUTE
143
+ type: NON_EEF
144
+ format: DEFAULT
145
+ state_key: null
146
+ - rep: ABSOLUTE
147
+ type: NON_EEF
148
+ format: DEFAULT
149
+ state_key: null
150
+ language:
151
+ delta_indices:
152
+ - 0
153
+ modality_keys:
154
+ - annotation.human.task_description
155
+ sin_cos_embedding_keys: null
156
+ mean_std_embedding_keys: null
157
+ action_configs: null
158
+ download_cache: false
159
+ shard_size: 1024
160
+ episode_sampling_rate: 0.1
161
+ num_shards_per_epoch: 100000
162
+ override_pretraining_statistics: false
163
+ mode: single_turn
164
+ random_chop: 0.0
165
+ mock_dataset_mode: false
166
+ shuffle: true
167
+ seed: 42
168
+ multiprocessing_context: fork
169
+ allow_padding: false
170
+ subsample_ratio: 1.0
171
+ image_crop_size:
172
+ - 244
173
+ - 244
174
+ image_target_size:
175
+ - 224
176
+ - 224
177
+ video_backend: torchcodec
178
+ training:
179
+ output_dir: /models/ORCA-GROOT-N1.6-Sim-Pick-Place
180
+ experiment_name: null
181
+ max_steps: 100000
182
+ global_batch_size: 32
183
+ batch_size: null
184
+ gradient_accumulation_steps: 1
185
+ learning_rate: 0.0001
186
+ lr_scheduler_type: cosine
187
+ weight_decay: 1.0e-05
188
+ warmup_ratio: 0.05
189
+ warmup_steps: 0
190
+ max_grad_norm: 1.0
191
+ optim: adamw_torch
192
+ start_from_checkpoint: nvidia/GR00T-N1.6-3B
193
+ tf32: true
194
+ fp16: false
195
+ bf16: true
196
+ eval_bf16: true
197
+ logging_steps: 10
198
+ save_steps: 10000
199
+ save_total_limit: 9
200
+ save_vl_model: false
201
+ upload_checkpoints: false
202
+ upload_every: 1000
203
+ upload_last_n_checkpoints: 5
204
+ max_concurrent_uploads: 2
205
+ eval_strategy: 'no'
206
+ eval_steps: 500
207
+ eval_set_split_ratio: 0.1
208
+ eval_batch_size: 2
209
+ save_best_eval_metric_name: ''
210
+ save_best_eval_metric_greater_is_better: true
211
+ deepspeed_stage: 2
212
+ gradient_checkpointing: false
213
+ transformers_trust_remote_code: true
214
+ transformers_local_files_only: false
215
+ transformers_cache_dir: null
216
+ transformers_access_token: null
217
+ use_ddp: false
218
+ ddp_bucket_cap_mb: 100
219
+ num_gpus: 1
220
+ dataloader_num_workers: 8
221
+ remove_unused_columns: false
222
+ use_wandb: true
223
+ wandb_project: finetune-gr00t-n1d6
224
+ enable_profiling: false
225
+ max_retries: 3
226
+ assert_loss_less_than: null
227
+ add_rl_callback: false
228
+ enable_open_loop_eval: false
229
+ open_loop_eval_traj_ids:
230
+ - 0
231
+ open_loop_eval_steps_per_traj: 100
232
+ open_loop_eval_plot_indices: null
233
+ max_steps: 100000
234
+ save_steps: 10000
experiment_cfg/config.yaml ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !!python/object:gr00t.configs.base_config.Config
2
+ data: !!python/object:gr00t.configs.data.data_config.DataConfig
3
+ allow_padding: false
4
+ datasets:
5
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
6
+ dataset_paths:
7
+ - /datasets/orca-sim-pick-and-place-mimic/stage1_3_cosmos/lerobot
8
+ - /datasets/orca-sim-pick-and-place-mimic/stage1_5_cosmos/lerobot
9
+ - /datasets/orca-sim-pick-and-place-mimic/stage1_7_cosmos/lerobot
10
+ - /datasets/orca-sim-pick-and-place-mimic/stage1_8_cosmos/lerobot
11
+ dataset_type: physical_embodiment
12
+ embodiment_tag: new_embodiment
13
+ mix_ratio: 1.0
14
+ val_dataset_path: null
15
+ download_cache: false
16
+ episode_sampling_rate: 0.1
17
+ image_crop_size:
18
+ - 244
19
+ - 244
20
+ image_target_size:
21
+ - 224
22
+ - 224
23
+ mock_dataset_mode: false
24
+ modality_configs:
25
+ new_embodiment:
26
+ action: !!python/object:gr00t.data.types.ModalityConfig
27
+ action_configs:
28
+ - !!python/object:gr00t.data.types.ActionConfig
29
+ format: &id001 !!python/object/apply:gr00t.data.types.ActionFormat
30
+ - default
31
+ rep: &id002 !!python/object/apply:gr00t.data.types.ActionRepresentation
32
+ - absolute
33
+ state_key: null
34
+ type: &id003 !!python/object/apply:gr00t.data.types.ActionType
35
+ - non_eef
36
+ - !!python/object:gr00t.data.types.ActionConfig
37
+ format: *id001
38
+ rep: *id002
39
+ state_key: null
40
+ type: *id003
41
+ - !!python/object:gr00t.data.types.ActionConfig
42
+ format: *id001
43
+ rep: *id002
44
+ state_key: null
45
+ type: *id003
46
+ - !!python/object:gr00t.data.types.ActionConfig
47
+ format: *id001
48
+ rep: *id002
49
+ state_key: null
50
+ type: *id003
51
+ - !!python/object:gr00t.data.types.ActionConfig
52
+ format: *id001
53
+ rep: *id002
54
+ state_key: null
55
+ type: *id003
56
+ - !!python/object:gr00t.data.types.ActionConfig
57
+ format: *id001
58
+ rep: *id002
59
+ state_key: null
60
+ type: *id003
61
+ delta_indices:
62
+ - 0
63
+ - 1
64
+ - 2
65
+ - 3
66
+ - 4
67
+ - 5
68
+ - 6
69
+ - 7
70
+ - 8
71
+ - 9
72
+ - 10
73
+ - 11
74
+ - 12
75
+ - 13
76
+ - 14
77
+ - 15
78
+ mean_std_embedding_keys: null
79
+ modality_keys:
80
+ - left_arm
81
+ - right_arm
82
+ - left_hand
83
+ - right_hand
84
+ - base_height_command
85
+ - navigate_command
86
+ sin_cos_embedding_keys: null
87
+ language: !!python/object:gr00t.data.types.ModalityConfig
88
+ action_configs: null
89
+ delta_indices:
90
+ - 0
91
+ mean_std_embedding_keys: null
92
+ modality_keys:
93
+ - annotation.human.task_description
94
+ sin_cos_embedding_keys: null
95
+ state: !!python/object:gr00t.data.types.ModalityConfig
96
+ action_configs: null
97
+ delta_indices:
98
+ - 0
99
+ mean_std_embedding_keys: null
100
+ modality_keys:
101
+ - left_arm
102
+ - right_arm
103
+ - left_hand
104
+ - right_hand
105
+ - waist
106
+ sin_cos_embedding_keys: null
107
+ video: !!python/object:gr00t.data.types.ModalityConfig
108
+ action_configs: null
109
+ delta_indices:
110
+ - 0
111
+ mean_std_embedding_keys: null
112
+ modality_keys:
113
+ - ego_view
114
+ sin_cos_embedding_keys: null
115
+ mode: single_turn
116
+ multiprocessing_context: fork
117
+ num_shards_per_epoch: 100000
118
+ override_pretraining_statistics: false
119
+ random_chop: 0.0
120
+ seed: 42
121
+ shard_size: 1024
122
+ shuffle: true
123
+ subsample_ratio: 1.0
124
+ video_backend: torchcodec
125
+ load_config_path: null
126
+ model: !!python/object:gr00t.configs.model.gr00t_n1d6.Gr00tN1d6Config
127
+ _attn_implementation_autoset: false
128
+ _attn_implementation_internal: null
129
+ _commit_hash: null
130
+ _name_or_path: ''
131
+ add_cross_attention: false
132
+ architectures: null
133
+ backbone_model_type: eagle
134
+ backbone_trainable_params_fp32: true
135
+ bad_words_ids: null
136
+ begin_suppress_tokens: null
137
+ bos_token_id: null
138
+ chunk_size_feed_forward: 0
139
+ color_jitter_params:
140
+ brightness: 0.3
141
+ contrast: 0.4
142
+ hue: 0.08
143
+ saturation: 0.5
144
+ cross_attention_hidden_size: null
145
+ decoder_start_token_id: null
146
+ diffusion_model_cfg:
147
+ attention_head_dim: 48
148
+ dropout: 0.2
149
+ final_dropout: true
150
+ interleave_self_attention: true
151
+ norm_type: ada_norm
152
+ num_attention_heads: 32
153
+ num_layers: 32
154
+ output_dim: 1024
155
+ positional_embeddings: null
156
+ diversity_penalty: 0.0
157
+ do_sample: false
158
+ eagle_collator: true
159
+ early_stopping: false
160
+ encoder_no_repeat_ngram_size: 0
161
+ eos_token_id: null
162
+ exponential_decay_length_penalty: null
163
+ finetuning_task: null
164
+ forced_bos_token_id: null
165
+ forced_eos_token_id: null
166
+ id2label:
167
+ 0: LABEL_0
168
+ 1: LABEL_1
169
+ is_decoder: false
170
+ is_encoder_decoder: false
171
+ label2id:
172
+ LABEL_0: 0
173
+ LABEL_1: 1
174
+ length_penalty: 1.0
175
+ load_bf16: false
176
+ max_length: 20
177
+ min_length: 0
178
+ model_name: nvidia/Eagle-Block2A-2B-v2
179
+ no_repeat_ngram_size: 0
180
+ num_beam_groups: 1
181
+ num_beams: 1
182
+ num_return_sequences: 1
183
+ output_attentions: false
184
+ output_hidden_states: false
185
+ output_scores: false
186
+ pad_token_id: null
187
+ prefix: null
188
+ problem_type: null
189
+ pruned_heads: {}
190
+ random_rotation_angle: null
191
+ remove_invalid_values: false
192
+ repetition_penalty: 1.0
193
+ reproject_vision: false
194
+ return_dict: true
195
+ return_dict_in_generate: false
196
+ sep_token_id: null
197
+ state_dropout_prob: 0.0
198
+ suppress_tokens: null
199
+ task_specific_params: null
200
+ temperature: 1.0
201
+ tf_legacy_loss: false
202
+ tie_encoder_decoder: false
203
+ tie_word_embeddings: true
204
+ tokenizer_class: null
205
+ top_k: 50
206
+ top_p: 1.0
207
+ torch_dtype: null
208
+ torchscript: false
209
+ transformers_version: null
210
+ tune_diffusion_model: true
211
+ tune_llm: false
212
+ tune_projector: true
213
+ tune_visual: true
214
+ typical_p: 1.0
215
+ use_bfloat16: false
216
+ use_relative_action: true
217
+ training: !!python/object:gr00t.configs.training.training_config.TrainingConfig
218
+ add_rl_callback: false
219
+ assert_loss_less_than: null
220
+ batch_size: null
221
+ bf16: true
222
+ dataloader_num_workers: 8
223
+ ddp_bucket_cap_mb: 100
224
+ deepspeed_stage: 2
225
+ enable_open_loop_eval: false
226
+ enable_profiling: false
227
+ eval_batch_size: 2
228
+ eval_bf16: true
229
+ eval_set_split_ratio: 0.1
230
+ eval_steps: 500
231
+ eval_strategy: 'no'
232
+ experiment_name: null
233
+ fp16: false
234
+ global_batch_size: 32
235
+ gradient_accumulation_steps: 1
236
+ gradient_checkpointing: false
237
+ learning_rate: 0.0001
238
+ logging_steps: 10
239
+ lr_scheduler_type: cosine
240
+ max_concurrent_uploads: 2
241
+ max_grad_norm: 1.0
242
+ max_retries: 3
243
+ max_steps: 100000
244
+ num_gpus: 1
245
+ open_loop_eval_plot_indices: null
246
+ open_loop_eval_steps_per_traj: 100
247
+ open_loop_eval_traj_ids:
248
+ - 0
249
+ optim: adamw_torch
250
+ output_dir: /models/ORCA-GROOT-N1.6-Sim-Pick-Place
251
+ remove_unused_columns: false
252
+ save_best_eval_metric_greater_is_better: true
253
+ save_best_eval_metric_name: ''
254
+ save_steps: 10000
255
+ save_total_limit: 9
256
+ save_vl_model: false
257
+ start_from_checkpoint: nvidia/GR00T-N1.6-3B
258
+ tf32: true
259
+ transformers_access_token: null
260
+ transformers_cache_dir: null
261
+ transformers_local_files_only: false
262
+ transformers_trust_remote_code: true
263
+ upload_checkpoints: false
264
+ upload_every: 1000
265
+ upload_last_n_checkpoints: 5
266
+ use_ddp: false
267
+ use_wandb: true
268
+ wandb_project: finetune-gr00t-n1d6
269
+ warmup_ratio: 0.05
270
+ warmup_steps: 0
271
+ weight_decay: 1.0e-05
experiment_cfg/dataset_statistics.json ADDED
@@ -0,0 +1,541 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "new_embodiment": {
3
+ "state": {
4
+ "left_arm": {
5
+ "min": [
6
+ -1.6188877820968628,
7
+ -0.8244653344154358,
8
+ -0.6688740253448486,
9
+ -0.9937067627906799,
10
+ -1.8114137649536133,
11
+ -1.1228705644607544,
12
+ -1.6144299507141113
13
+ ],
14
+ "max": [
15
+ 0.5598819255828857,
16
+ 1.3845295906066895,
17
+ 1.3240509033203125,
18
+ 1.3527957201004028,
19
+ 1.7814620733261108,
20
+ 1.6144317388534546,
21
+ 1.1065175533294678
22
+ ],
23
+ "mean": [
24
+ -0.37701161097217084,
25
+ 0.622215387442254,
26
+ 0.37503467529814766,
27
+ 0.15783503666421436,
28
+ 0.2399788372564,
29
+ 1.1200109759703378,
30
+ -0.9174301245354659
31
+ ],
32
+ "std": [
33
+ 0.4946083775214465,
34
+ 0.29932238916855064,
35
+ 0.32772549348322094,
36
+ 0.48449540532395885,
37
+ 0.40544754602526073,
38
+ 0.3511242435683432,
39
+ 0.41043898365089426
40
+ ],
41
+ "q01": [
42
+ -1.4819707024097442,
43
+ -0.14474324360489793,
44
+ -0.574706056714058,
45
+ -0.8940612733364105,
46
+ -1.4438544452190398,
47
+ 0.14948371022939683,
48
+ -1.6144284009933472
49
+ ],
50
+ "q99": [
51
+ 0.19417367294430646,
52
+ 1.3513104689121245,
53
+ 1.0247645223140713,
54
+ 1.2698133671283716,
55
+ 1.2623580229282378,
56
+ 1.6144297122955322,
57
+ 0.6261248129606154
58
+ ]
59
+ },
60
+ "right_arm": {
61
+ "min": [
62
+ -1.5973707437515259,
63
+ -1.4186440706253052,
64
+ -1.3715554475784302,
65
+ -0.9967253804206848,
66
+ -1.9683640003204346,
67
+ -1.2039880752563477,
68
+ -0.7873280644416809
69
+ ],
70
+ "max": [
71
+ 0.6645666360855103,
72
+ 0.7108421325683594,
73
+ 0.8019299507141113,
74
+ 1.791968584060669,
75
+ 1.6332509517669678,
76
+ 1.6144312620162964,
77
+ 1.6144299507141113
78
+ ],
79
+ "mean": [
80
+ -0.35405250633789215,
81
+ -0.6411186511942882,
82
+ -0.3977095291314536,
83
+ 0.17245792957725906,
84
+ -0.23415605063914072,
85
+ 1.1111519218280617,
86
+ 0.927019808860804
87
+ ],
88
+ "std": [
89
+ 0.48128610921576437,
90
+ 0.3071013246943398,
91
+ 0.3253459083547921,
92
+ 0.4865591021694431,
93
+ 0.40655478518574584,
94
+ 0.35977562430478466,
95
+ 0.39641489447232386
96
+ ],
97
+ "q01": [
98
+ -1.4656522178649902,
99
+ -1.39724303483963,
100
+ -1.0831108903884887,
101
+ -0.930193657875061,
102
+ -1.2693367302417755,
103
+ 0.10924758933484555,
104
+ -0.5172812539339066
105
+ ],
106
+ "q99": [
107
+ 0.20503960207104668,
108
+ 0.16118972286581973,
109
+ 0.5304996186494825,
110
+ 1.3874476826190947,
111
+ 1.3712920653820024,
112
+ 1.6144297122955322,
113
+ 1.611855911016464
114
+ ]
115
+ },
116
+ "left_hand": {
117
+ "min": [
118
+ -0.6636313796043396,
119
+ -1.2668558359146118,
120
+ -0.6931474208831787,
121
+ -1.238078236579895,
122
+ -0.15442876517772675,
123
+ -0.024228721857070923,
124
+ -1.3946487342764158e-07
125
+ ],
126
+ "max": [
127
+ 1.4664448144685593e-07,
128
+ 3.774755796825957e-08,
129
+ 2.3912218239274807e-05,
130
+ 1.4326724340207875e-05,
131
+ 0.06299737840890884,
132
+ 0.7382361888885498,
133
+ 0.8579182028770447
134
+ ],
135
+ "mean": [
136
+ -0.32608566497335373,
137
+ -0.7007213049377038,
138
+ -0.3091359140462433,
139
+ -0.7064073512096278,
140
+ -0.0007393016489160969,
141
+ 0.38868270547974193,
142
+ 0.40552368424586116
143
+ ],
144
+ "std": [
145
+ 0.27386463929352794,
146
+ 0.5799191449185522,
147
+ 0.25783882398027064,
148
+ 0.5828784381585906,
149
+ 0.010326922473384853,
150
+ 0.32419180806582787,
151
+ 0.33414980089677404
152
+ ],
153
+ "q01": [
154
+ -0.6073416697978974,
155
+ -1.2136946415901184,
156
+ -0.5961684477329254,
157
+ -1.2116613757610322,
158
+ -0.06334078542888165,
159
+ -0.002377869929186999,
160
+ 5.822703361135773e-11
161
+ ],
162
+ "q99": [
163
+ 5.486197252047003e-10,
164
+ 2.372745946943576e-10,
165
+ 7.51225351369729e-10,
166
+ 2.186030552409053e-10,
167
+ 0.03812098965048779,
168
+ 0.7133938479423522,
169
+ 0.7890969663858408
170
+ ]
171
+ },
172
+ "right_hand": {
173
+ "min": [
174
+ -1.1264450705539275e-07,
175
+ -4.103394246612879e-07,
176
+ -2.3143680664361455e-05,
177
+ -2.2077354515204206e-05,
178
+ -0.11818881332874298,
179
+ -0.7287072539329529,
180
+ -0.8129876852035522
181
+ ],
182
+ "max": [
183
+ 0.6552737355232239,
184
+ 1.2486000061035156,
185
+ 0.6695961356163025,
186
+ 1.2865338325500488,
187
+ 0.07462365180253983,
188
+ 0.0623926967382431,
189
+ 1.360115504667192e-07
190
+ ],
191
+ "mean": [
192
+ 0.32603508115604224,
193
+ 0.7019679585829476,
194
+ 0.305846294819914,
195
+ 0.7061730283775077,
196
+ 0.0009488441506462382,
197
+ -0.38256330225641366,
198
+ -0.4028953769743837
199
+ ],
200
+ "std": [
201
+ 0.27344637434110475,
202
+ 0.5808696397330089,
203
+ 0.25478759263415374,
204
+ 0.581769250960524,
205
+ 0.009655141392575449,
206
+ 0.32200539057458083,
207
+ 0.33302532344362673
208
+ ],
209
+ "q01": [
210
+ -7.135985397033196e-10,
211
+ -2.377115057572432e-10,
212
+ -8.369249776540855e-10,
213
+ -1.8977105553652506e-10,
214
+ -0.04443687982857227,
215
+ -0.7134130877256394,
216
+ -0.7536908400058746
217
+ ],
218
+ "q99": [
219
+ 0.631313579082489,
220
+ 1.2208076870441436,
221
+ 0.5956605392694473,
222
+ 1.2157493793964385,
223
+ 0.05435568977147335,
224
+ 0.002578642389271219,
225
+ 0.0
226
+ ]
227
+ },
228
+ "waist": {
229
+ "min": [
230
+ -0.08484945446252823,
231
+ -0.17682865262031555,
232
+ -0.04478275775909424
233
+ ],
234
+ "max": [
235
+ 0.06575624644756317,
236
+ 0.15222139656543732,
237
+ 0.21880359947681427
238
+ ],
239
+ "mean": [
240
+ 0.0006298807157444096,
241
+ 0.004861228026153632,
242
+ 0.07411838826162137
243
+ ],
244
+ "std": [
245
+ 0.01696499541536457,
246
+ 0.054129893445569476,
247
+ 0.021960865412836504
248
+ ],
249
+ "q01": [
250
+ -0.03211269959807396,
251
+ -0.10020484030246735,
252
+ 0.019505513962358237
253
+ ],
254
+ "q99": [
255
+ 0.03720596775412556,
256
+ 0.12259715944528575,
257
+ 0.16097534537315367
258
+ ]
259
+ }
260
+ },
261
+ "action": {
262
+ "left_arm": {
263
+ "min": [
264
+ -1.7263685464859009,
265
+ -0.8693848848342896,
266
+ -0.6196113228797913,
267
+ -1.047199010848999,
268
+ -1.961471676826477,
269
+ -1.1776831150054932,
270
+ -1.6144285202026367
271
+ ],
272
+ "max": [
273
+ 0.5359810590744019,
274
+ 1.4409669637680054,
275
+ 1.2984442710876465,
276
+ 1.2675622701644897,
277
+ 1.7942465543746948,
278
+ 1.6144285202026367,
279
+ 0.9927496314048767
280
+ ],
281
+ "mean": [
282
+ -0.4278667482319257,
283
+ 0.649818023309013,
284
+ 0.3946276520261701,
285
+ 0.07472400054249247,
286
+ 0.20272922651558525,
287
+ 1.0919044724363365,
288
+ -0.9606466530174609
289
+ ],
290
+ "std": [
291
+ 0.5163644120141647,
292
+ 0.31051259045659124,
293
+ 0.3232428585094077,
294
+ 0.4900628976458507,
295
+ 0.40242868284550015,
296
+ 0.36495802692479223,
297
+ 0.416769561480303
298
+ ],
299
+ "q01": [
300
+ -1.5964591109752655,
301
+ -0.23531204849481582,
302
+ -0.5155686557292938,
303
+ -0.9812510508298874,
304
+ -1.6113185107707977,
305
+ 0.096123421266675,
306
+ -1.6144285202026367
307
+ ],
308
+ "q99": [
309
+ 0.17772121801972385,
310
+ 1.407341595888138,
311
+ 1.0876906502246857,
312
+ 1.1679468894004816,
313
+ 1.134265422821045,
314
+ 1.6144285202026367,
315
+ 0.5308546763658487
316
+ ]
317
+ },
318
+ "right_arm": {
319
+ "min": [
320
+ -1.6998568773269653,
321
+ -1.483291745185852,
322
+ -1.443288803100586,
323
+ -1.047199010848999,
324
+ -1.9721182584762573,
325
+ -1.2583476305007935,
326
+ -0.9640278816223145
327
+ ],
328
+ "max": [
329
+ 0.6452722549438477,
330
+ 0.7532204389572144,
331
+ 0.6300758123397827,
332
+ 1.707554578781128,
333
+ 1.775823950767517,
334
+ 1.6144285202026367,
335
+ 1.6144285202026367
336
+ ],
337
+ "mean": [
338
+ -0.4050310619224775,
339
+ -0.6717719246220115,
340
+ -0.41923698447398006,
341
+ 0.08687886090591451,
342
+ -0.19810921054592492,
343
+ 1.0821888584964323,
344
+ 0.9696818246746695
345
+ ],
346
+ "std": [
347
+ 0.5025011695925508,
348
+ 0.3186337523657578,
349
+ 0.32164503326881233,
350
+ 0.49357459211424437,
351
+ 0.4039177868508143,
352
+ 0.3748570877458252,
353
+ 0.4044295671498674
354
+ ],
355
+ "q01": [
356
+ -1.576135537624359,
357
+ -1.4606408774852753,
358
+ -1.154829856157303,
359
+ -1.0447131776809693,
360
+ -1.162146143913269,
361
+ 0.031073938850313426,
362
+ -0.4146004369854927
363
+ ],
364
+ "q99": [
365
+ 0.1896919891238198,
366
+ 0.1906195104122157,
367
+ 0.47255201905965794,
368
+ 1.3261596608161925,
369
+ 1.5163511252403197,
370
+ 1.6144285202026367,
371
+ 1.6144285202026367
372
+ ]
373
+ },
374
+ "left_hand": {
375
+ "min": [
376
+ -0.6000000238418579,
377
+ -1.2000000476837158,
378
+ -0.6000000238418579,
379
+ -1.2000000476837158,
380
+ 0.0,
381
+ 0.0,
382
+ 0.0
383
+ ],
384
+ "max": [
385
+ 0.0,
386
+ 0.0,
387
+ 0.0,
388
+ 0.0,
389
+ 0.0,
390
+ 0.699999988079071,
391
+ 0.699999988079071
392
+ ],
393
+ "mean": [
394
+ -0.3543432927289546,
395
+ -0.7086865854579092,
396
+ -0.3543432927289546,
397
+ -0.7086865854579092,
398
+ 0.0,
399
+ 0.41355503927792936,
400
+ 0.41355503927792936
401
+ ],
402
+ "std": [
403
+ 0.29501940357613954,
404
+ 0.5900388071522791,
405
+ 0.29501940357613954,
406
+ 0.5900388071522791,
407
+ 0.0,
408
+ 0.34418538597523085,
409
+ 0.34418538597523085
410
+ ],
411
+ "q01": [
412
+ -0.6000000238418579,
413
+ -1.2000000476837158,
414
+ -0.6000000238418579,
415
+ -1.2000000476837158,
416
+ 0.0,
417
+ 0.0,
418
+ 0.0
419
+ ],
420
+ "q99": [
421
+ 0.0,
422
+ 0.0,
423
+ 0.0,
424
+ 0.0,
425
+ 0.0,
426
+ 0.699999988079071,
427
+ 0.699999988079071
428
+ ]
429
+ },
430
+ "right_hand": {
431
+ "min": [
432
+ -0.0,
433
+ -0.0,
434
+ -0.0,
435
+ -0.0,
436
+ -0.0,
437
+ -0.699999988079071,
438
+ -0.699999988079071
439
+ ],
440
+ "max": [
441
+ 0.6000000238418579,
442
+ 1.2000000476837158,
443
+ 0.6000000238418579,
444
+ 1.2000000476837158,
445
+ -0.0,
446
+ -0.0,
447
+ -0.0
448
+ ],
449
+ "mean": [
450
+ 0.3543432927289546,
451
+ 0.7086865854579092,
452
+ 0.3543432927289546,
453
+ 0.7086865854579092,
454
+ 0.0,
455
+ -0.41355503927792936,
456
+ -0.41355503927792936
457
+ ],
458
+ "std": [
459
+ 0.29501940357613954,
460
+ 0.5900388071522791,
461
+ 0.29501940357613954,
462
+ 0.5900388071522791,
463
+ 0.0,
464
+ 0.34418538597523085,
465
+ 0.34418538597523085
466
+ ],
467
+ "q01": [
468
+ 0.0,
469
+ 0.0,
470
+ 0.0,
471
+ 0.0,
472
+ 0.0,
473
+ -0.699999988079071,
474
+ -0.699999988079071
475
+ ],
476
+ "q99": [
477
+ 0.6000000238418579,
478
+ 1.2000000476837158,
479
+ 0.6000000238418579,
480
+ 1.2000000476837158,
481
+ -0.0,
482
+ -0.0,
483
+ -0.0
484
+ ]
485
+ },
486
+ "base_height_command": {
487
+ "min": [
488
+ 0.75
489
+ ],
490
+ "max": [
491
+ 0.75
492
+ ],
493
+ "mean": [
494
+ 0.75
495
+ ],
496
+ "std": [
497
+ 0.0
498
+ ],
499
+ "q01": [
500
+ 0.75
501
+ ],
502
+ "q99": [
503
+ 0.75
504
+ ]
505
+ },
506
+ "navigate_command": {
507
+ "min": [
508
+ -0.20000000298023224,
509
+ 0.0,
510
+ -0.6000000238418579
511
+ ],
512
+ "max": [
513
+ 0.20000000298023224,
514
+ 0.20000000298023224,
515
+ 0.20000000298023224
516
+ ],
517
+ "mean": [
518
+ 0.06668563140247832,
519
+ 0.004235940249883477,
520
+ -0.11014999049586177
521
+ ],
522
+ "std": [
523
+ 0.12825434879142483,
524
+ 0.02879522156961949,
525
+ 0.22963595813044332
526
+ ],
527
+ "q01": [
528
+ -0.20000000298023224,
529
+ 0.0,
530
+ -0.6000000238418579
531
+ ],
532
+ "q99": [
533
+ 0.20000000298023224,
534
+ 0.20000000298023224,
535
+ 0.20000000298023224
536
+ ]
537
+ }
538
+ },
539
+ "relative_action": {}
540
+ }
541
+ }
experiment_cfg/final_model_config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "Gr00tN1d6",
3
+ "model_dtype": "bfloat16",
4
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
5
+ "backbone_model_type": "eagle",
6
+ "model_revision": null,
7
+ "tune_top_llm_layers": 4,
8
+ "backbone_embedding_dim": 2048,
9
+ "tune_llm": false,
10
+ "tune_visual": true,
11
+ "select_layer": 16,
12
+ "reproject_vision": false,
13
+ "use_flash_attention": true,
14
+ "load_bf16": true,
15
+ "collator_overwrite_image_inputs": false,
16
+ "eagle_collator": true,
17
+ "backbone_trainable_params_fp32": true,
18
+ "apply_sincos_state_encoding": true,
19
+ "use_relative_action": true,
20
+ "max_state_dim": 128,
21
+ "max_action_dim": 128,
22
+ "action_horizon": 50,
23
+ "hidden_size": 1024,
24
+ "input_embedding_dim": 1536,
25
+ "add_pos_embed": true,
26
+ "attn_dropout": 0.2,
27
+ "use_vlln": true,
28
+ "max_seq_len": 1024,
29
+ "use_alternate_vl_dit": true,
30
+ "attend_text_every_n_blocks": 2,
31
+ "diffusion_model_cfg": {
32
+ "attention_head_dim": 48,
33
+ "dropout": 0.2,
34
+ "final_dropout": true,
35
+ "interleave_self_attention": true,
36
+ "norm_type": "ada_norm",
37
+ "num_attention_heads": 32,
38
+ "num_layers": 32,
39
+ "output_dim": 1024,
40
+ "positional_embeddings": null
41
+ },
42
+ "num_inference_timesteps": 4,
43
+ "noise_beta_alpha": 1.5,
44
+ "noise_beta_beta": 1.0,
45
+ "noise_s": 0.999,
46
+ "num_timestep_buckets": 1000,
47
+ "tune_projector": true,
48
+ "tune_diffusion_model": true,
49
+ "tune_vlln": true,
50
+ "state_dropout_prob": 0.0,
51
+ "state_additive_noise_scale": 0.0,
52
+ "max_num_embodiments": 32
53
+ }
experiment_cfg/final_processor_config.json ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf6efad6efbd23e182905c1a5526c725e275b47202e192631042bf3a6966667c
3
+ size 4966860224
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8871ac4cee155d3d4e135f62c4d4873ec7804c7c9add671f7203a151322e913
3
+ size 4665286304
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcdf4f7bea965b127eadeccedd1482384b4a623819f0703c6afe4b6d360e9445
3
+ size 1063814336
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
processor_config.json ADDED
@@ -0,0 +1,485 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "Gr00tN1d6Processor",
3
+ "processor_kwargs": {
4
+ "modality_configs": {
5
+ "behavior_r1_pro": {
6
+ "video": {
7
+ "delta_indices": [
8
+ 0
9
+ ],
10
+ "modality_keys": [
11
+ "observation.images.rgb.head_256_256",
12
+ "observation.images.rgb.left_wrist_256_256",
13
+ "observation.images.rgb.right_wrist_256_256"
14
+ ],
15
+ "sin_cos_embedding_keys": null,
16
+ "mean_std_embedding_keys": null,
17
+ "action_configs": null
18
+ },
19
+ "state": {
20
+ "delta_indices": [
21
+ 0
22
+ ],
23
+ "modality_keys": [
24
+ "robot_pos",
25
+ "robot_ori_cos",
26
+ "robot_ori_sin",
27
+ "robot_2d_ori",
28
+ "robot_2d_ori_cos",
29
+ "robot_2d_ori_sin",
30
+ "robot_lin_vel",
31
+ "robot_ang_vel",
32
+ "arm_left_qpos",
33
+ "arm_left_qpos_sin",
34
+ "arm_left_qpos_cos",
35
+ "eef_left_pos",
36
+ "eef_left_quat",
37
+ "gripper_left_qpos",
38
+ "arm_right_qpos",
39
+ "arm_right_qpos_sin",
40
+ "arm_right_qpos_cos",
41
+ "eef_right_pos",
42
+ "eef_right_quat",
43
+ "gripper_right_qpos",
44
+ "trunk_qpos"
45
+ ],
46
+ "sin_cos_embedding_keys": null,
47
+ "mean_std_embedding_keys": null,
48
+ "action_configs": null
49
+ },
50
+ "action": {
51
+ "delta_indices": [
52
+ 0,
53
+ 1,
54
+ 2,
55
+ 3,
56
+ 4,
57
+ 5,
58
+ 6,
59
+ 7,
60
+ 8,
61
+ 9,
62
+ 10,
63
+ 11,
64
+ 12,
65
+ 13,
66
+ 14,
67
+ 15,
68
+ 16,
69
+ 17,
70
+ 18,
71
+ 19,
72
+ 20,
73
+ 21,
74
+ 22,
75
+ 23,
76
+ 24,
77
+ 25,
78
+ 26,
79
+ 27,
80
+ 28,
81
+ 29,
82
+ 30,
83
+ 31
84
+ ],
85
+ "modality_keys": [
86
+ "base",
87
+ "torso",
88
+ "left_arm",
89
+ "left_gripper",
90
+ "right_arm",
91
+ "right_gripper"
92
+ ],
93
+ "sin_cos_embedding_keys": null,
94
+ "mean_std_embedding_keys": null,
95
+ "action_configs": [
96
+ {
97
+ "rep": "ABSOLUTE",
98
+ "type": "NON_EEF",
99
+ "format": "DEFAULT",
100
+ "state_key": null
101
+ },
102
+ {
103
+ "rep": "RELATIVE",
104
+ "type": "NON_EEF",
105
+ "format": "DEFAULT",
106
+ "state_key": "trunk_qpos"
107
+ },
108
+ {
109
+ "rep": "RELATIVE",
110
+ "type": "NON_EEF",
111
+ "format": "DEFAULT",
112
+ "state_key": "arm_left_qpos"
113
+ },
114
+ {
115
+ "rep": "ABSOLUTE",
116
+ "type": "NON_EEF",
117
+ "format": "DEFAULT",
118
+ "state_key": null
119
+ },
120
+ {
121
+ "rep": "RELATIVE",
122
+ "type": "NON_EEF",
123
+ "format": "DEFAULT",
124
+ "state_key": "arm_right_qpos"
125
+ },
126
+ {
127
+ "rep": "ABSOLUTE",
128
+ "type": "NON_EEF",
129
+ "format": "DEFAULT",
130
+ "state_key": null
131
+ }
132
+ ]
133
+ },
134
+ "language": {
135
+ "delta_indices": [
136
+ 0
137
+ ],
138
+ "modality_keys": [
139
+ "annotation.human.coarse_action"
140
+ ],
141
+ "sin_cos_embedding_keys": null,
142
+ "mean_std_embedding_keys": null,
143
+ "action_configs": null
144
+ }
145
+ },
146
+ "gr1": {
147
+ "video": {
148
+ "delta_indices": [
149
+ 0
150
+ ],
151
+ "modality_keys": [
152
+ "ego_view_bg_crop_pad_res256_freq20"
153
+ ],
154
+ "sin_cos_embedding_keys": null,
155
+ "mean_std_embedding_keys": null,
156
+ "action_configs": null
157
+ },
158
+ "state": {
159
+ "delta_indices": [
160
+ 0
161
+ ],
162
+ "modality_keys": [
163
+ "left_arm",
164
+ "right_arm",
165
+ "left_hand",
166
+ "right_hand",
167
+ "waist"
168
+ ],
169
+ "sin_cos_embedding_keys": [
170
+ "left_arm",
171
+ "right_arm",
172
+ "left_hand",
173
+ "right_hand",
174
+ "waist"
175
+ ],
176
+ "mean_std_embedding_keys": null,
177
+ "action_configs": null
178
+ },
179
+ "action": {
180
+ "delta_indices": [
181
+ 0,
182
+ 1,
183
+ 2,
184
+ 3,
185
+ 4,
186
+ 5,
187
+ 6,
188
+ 7,
189
+ 8,
190
+ 9,
191
+ 10,
192
+ 11,
193
+ 12,
194
+ 13,
195
+ 14,
196
+ 15
197
+ ],
198
+ "modality_keys": [
199
+ "left_arm",
200
+ "right_arm",
201
+ "left_hand",
202
+ "right_hand",
203
+ "waist"
204
+ ],
205
+ "sin_cos_embedding_keys": null,
206
+ "mean_std_embedding_keys": null,
207
+ "action_configs": [
208
+ {
209
+ "rep": "RELATIVE",
210
+ "type": "NON_EEF",
211
+ "format": "DEFAULT",
212
+ "state_key": null
213
+ },
214
+ {
215
+ "rep": "RELATIVE",
216
+ "type": "NON_EEF",
217
+ "format": "DEFAULT",
218
+ "state_key": null
219
+ },
220
+ {
221
+ "rep": "RELATIVE",
222
+ "type": "NON_EEF",
223
+ "format": "DEFAULT",
224
+ "state_key": null
225
+ },
226
+ {
227
+ "rep": "RELATIVE",
228
+ "type": "NON_EEF",
229
+ "format": "DEFAULT",
230
+ "state_key": null
231
+ },
232
+ {
233
+ "rep": "ABSOLUTE",
234
+ "type": "NON_EEF",
235
+ "format": "DEFAULT",
236
+ "state_key": null
237
+ }
238
+ ]
239
+ },
240
+ "language": {
241
+ "delta_indices": [
242
+ 0
243
+ ],
244
+ "modality_keys": [
245
+ "task"
246
+ ],
247
+ "sin_cos_embedding_keys": null,
248
+ "mean_std_embedding_keys": null,
249
+ "action_configs": null
250
+ }
251
+ },
252
+ "robocasa_panda_omron": {
253
+ "video": {
254
+ "delta_indices": [
255
+ 0
256
+ ],
257
+ "modality_keys": [
258
+ "res256_image_side_0",
259
+ "res256_image_side_1",
260
+ "res256_image_wrist_0"
261
+ ],
262
+ "sin_cos_embedding_keys": null,
263
+ "mean_std_embedding_keys": null,
264
+ "action_configs": null
265
+ },
266
+ "state": {
267
+ "delta_indices": [
268
+ 0
269
+ ],
270
+ "modality_keys": [
271
+ "end_effector_position_relative",
272
+ "end_effector_rotation_relative",
273
+ "gripper_qpos",
274
+ "base_position",
275
+ "base_rotation"
276
+ ],
277
+ "sin_cos_embedding_keys": null,
278
+ "mean_std_embedding_keys": null,
279
+ "action_configs": null
280
+ },
281
+ "action": {
282
+ "delta_indices": [
283
+ 0,
284
+ 1,
285
+ 2,
286
+ 3,
287
+ 4,
288
+ 5,
289
+ 6,
290
+ 7,
291
+ 8,
292
+ 9,
293
+ 10,
294
+ 11,
295
+ 12,
296
+ 13,
297
+ 14,
298
+ 15
299
+ ],
300
+ "modality_keys": [
301
+ "end_effector_position",
302
+ "end_effector_rotation",
303
+ "gripper_close",
304
+ "base_motion",
305
+ "control_mode"
306
+ ],
307
+ "sin_cos_embedding_keys": null,
308
+ "mean_std_embedding_keys": null,
309
+ "action_configs": [
310
+ {
311
+ "rep": "ABSOLUTE",
312
+ "type": "NON_EEF",
313
+ "format": "DEFAULT",
314
+ "state_key": null
315
+ },
316
+ {
317
+ "rep": "ABSOLUTE",
318
+ "type": "NON_EEF",
319
+ "format": "DEFAULT",
320
+ "state_key": null
321
+ },
322
+ {
323
+ "rep": "ABSOLUTE",
324
+ "type": "NON_EEF",
325
+ "format": "DEFAULT",
326
+ "state_key": null
327
+ },
328
+ {
329
+ "rep": "ABSOLUTE",
330
+ "type": "NON_EEF",
331
+ "format": "DEFAULT",
332
+ "state_key": null
333
+ },
334
+ {
335
+ "rep": "ABSOLUTE",
336
+ "type": "NON_EEF",
337
+ "format": "DEFAULT",
338
+ "state_key": null
339
+ }
340
+ ]
341
+ },
342
+ "language": {
343
+ "delta_indices": [
344
+ 0
345
+ ],
346
+ "modality_keys": [
347
+ "annotation.human.action.task_description"
348
+ ],
349
+ "sin_cos_embedding_keys": null,
350
+ "mean_std_embedding_keys": null,
351
+ "action_configs": null
352
+ }
353
+ },
354
+ "new_embodiment": {
355
+ "video": {
356
+ "delta_indices": [
357
+ 0
358
+ ],
359
+ "modality_keys": [
360
+ "ego_view"
361
+ ],
362
+ "sin_cos_embedding_keys": null,
363
+ "mean_std_embedding_keys": null,
364
+ "action_configs": null
365
+ },
366
+ "state": {
367
+ "delta_indices": [
368
+ 0
369
+ ],
370
+ "modality_keys": [
371
+ "left_arm",
372
+ "right_arm",
373
+ "left_hand",
374
+ "right_hand",
375
+ "waist"
376
+ ],
377
+ "sin_cos_embedding_keys": null,
378
+ "mean_std_embedding_keys": null,
379
+ "action_configs": null
380
+ },
381
+ "action": {
382
+ "delta_indices": [
383
+ 0,
384
+ 1,
385
+ 2,
386
+ 3,
387
+ 4,
388
+ 5,
389
+ 6,
390
+ 7,
391
+ 8,
392
+ 9,
393
+ 10,
394
+ 11,
395
+ 12,
396
+ 13,
397
+ 14,
398
+ 15
399
+ ],
400
+ "modality_keys": [
401
+ "left_arm",
402
+ "right_arm",
403
+ "left_hand",
404
+ "right_hand",
405
+ "base_height_command",
406
+ "navigate_command"
407
+ ],
408
+ "sin_cos_embedding_keys": null,
409
+ "mean_std_embedding_keys": null,
410
+ "action_configs": [
411
+ {
412
+ "rep": "ABSOLUTE",
413
+ "type": "NON_EEF",
414
+ "format": "DEFAULT",
415
+ "state_key": null
416
+ },
417
+ {
418
+ "rep": "ABSOLUTE",
419
+ "type": "NON_EEF",
420
+ "format": "DEFAULT",
421
+ "state_key": null
422
+ },
423
+ {
424
+ "rep": "ABSOLUTE",
425
+ "type": "NON_EEF",
426
+ "format": "DEFAULT",
427
+ "state_key": null
428
+ },
429
+ {
430
+ "rep": "ABSOLUTE",
431
+ "type": "NON_EEF",
432
+ "format": "DEFAULT",
433
+ "state_key": null
434
+ },
435
+ {
436
+ "rep": "ABSOLUTE",
437
+ "type": "NON_EEF",
438
+ "format": "DEFAULT",
439
+ "state_key": null
440
+ },
441
+ {
442
+ "rep": "ABSOLUTE",
443
+ "type": "NON_EEF",
444
+ "format": "DEFAULT",
445
+ "state_key": null
446
+ }
447
+ ]
448
+ },
449
+ "language": {
450
+ "delta_indices": [
451
+ 0
452
+ ],
453
+ "modality_keys": [
454
+ "annotation.human.task_description"
455
+ ],
456
+ "sin_cos_embedding_keys": null,
457
+ "mean_std_embedding_keys": null,
458
+ "action_configs": null
459
+ }
460
+ }
461
+ },
462
+ "image_crop_size": null,
463
+ "image_target_size": null,
464
+ "use_albumentations": true,
465
+ "random_rotation_angle": null,
466
+ "color_jitter_params": {
467
+ "brightness": 0.3,
468
+ "contrast": 0.4,
469
+ "saturation": 0.5,
470
+ "hue": 0.08
471
+ },
472
+ "shortest_image_edge": 256,
473
+ "crop_fraction": 0.95,
474
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
475
+ "model_type": "eagle",
476
+ "formalize_language": true,
477
+ "max_state_dim": 128,
478
+ "max_action_dim": 128,
479
+ "max_action_horizon": 50,
480
+ "use_percentiles": false,
481
+ "clip_outliers": true,
482
+ "apply_sincos_state_encoding": true,
483
+ "use_relative_action": true
484
+ }
485
+ }
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6737d5c0bb1d934959fda29ae9639c3bb8ddb9740a0b198781e7df6e43e27573
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69a58e80395323ef08a4ef94d63e83fe07ffedb4206d41667d189a626475829f
3
+ size 1465
statistics.json ADDED
The diff for this file is too large to render. See raw diff
 
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:394186849149d5a727bef04d4ee20b7feaab35828bdffa3a536f4f73552a931a
3
+ size 5777
wandb_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"project": "finetune-gr00t-n1d6", "run_id": "ORCA-GROOT-N1.6-Sim-Pick-Place"}