Dongkkka committed on
Commit
a2195f6
·
verified ·
1 Parent(s): 8281e99

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_horizon": 50,
3
+ "add_pos_embed": true,
4
+ "apply_sincos_state_encoding": true,
5
+ "architectures": [
6
+ "Gr00tN1d6"
7
+ ],
8
+ "attn_dropout": 0.2,
9
+ "attn_implementation": null,
10
+ "backbone_embedding_dim": 2048,
11
+ "backbone_model_type": "eagle",
12
+ "backbone_trainable_params_fp32": true,
13
+ "collator_overwrite_image_inputs": false,
14
+ "color_jitter_params": {
15
+ "brightness": 0.1,
16
+ "contrast": 0.1,
17
+ "hue": 0.1,
18
+ "saturation": 0.1
19
+ },
20
+ "crop_fraction": 0.95,
21
+ "diffusion_model_cfg": {
22
+ "attention_head_dim": 48,
23
+ "dropout": 0.2,
24
+ "final_dropout": true,
25
+ "interleave_self_attention": true,
26
+ "norm_type": "ada_norm",
27
+ "num_attention_heads": 32,
28
+ "num_layers": 32,
29
+ "output_dim": 1024,
30
+ "positional_embeddings": null
31
+ },
32
+ "eagle_collator": true,
33
+ "formalize_language": true,
34
+ "gemma_collator": false,
35
+ "hidden_size": 1024,
36
+ "image_crop_size": null,
37
+ "image_target_size": null,
38
+ "input_embedding_dim": 1536,
39
+ "load_bf16": true,
40
+ "max_action_dim": 128,
41
+ "max_num_embodiments": 32,
42
+ "max_seq_len": 1024,
43
+ "max_state_dim": 128,
44
+ "model_dtype": "bfloat16",
45
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
46
+ "model_type": "Gr00tN1d6",
47
+ "noise_beta_alpha": 1.5,
48
+ "noise_beta_beta": 1.0,
49
+ "noise_s": 0.999,
50
+ "num_inference_timesteps": 4,
51
+ "num_timestep_buckets": 1000,
52
+ "random_rotation_angle": null,
53
+ "reproject_vision": false,
54
+ "select_layer": 16,
55
+ "shortest_image_edge": 256,
56
+ "state_dropout_prob": 0.0,
57
+ "torch_dtype": "bfloat16",
58
+ "transformers_version": "4.51.3",
59
+ "tune_diffusion_model": true,
60
+ "tune_llm": false,
61
+ "tune_projector": true,
62
+ "tune_top_llm_layers": 4,
63
+ "tune_visual": false,
64
+ "tune_vlln": true,
65
+ "use_albumentations_transforms": true,
66
+ "use_alternate_vl_dit": true,
67
+ "use_flash_attention": true,
68
+ "use_relative_action": true,
69
+ "use_vlln": true
70
+ }
embodiment_id.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "robocasa_panda_omron": 13,
3
+ "gr1": 20,
4
+ "behavior_r1_pro": 24,
5
+ "unitree_g1": 8,
6
+ "oxe_google": 0,
7
+ "oxe_widowx": 1,
8
+ "libero_panda": 2,
9
+ "oxe_droid": 16,
10
+ "new_embodiment": 10
11
+ }
experiment_cfg/conf.yaml ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ load_config_path: null
2
+ model:
3
+ model_type: Gr00tN1d6
4
+ model_dtype: bfloat16
5
+ model_name: nvidia/Eagle-Block2A-2B-v2
6
+ backbone_model_type: eagle
7
+ model_revision: null
8
+ tune_top_llm_layers: 4
9
+ backbone_embedding_dim: 2048
10
+ tune_llm: false
11
+ tune_visual: false
12
+ select_layer: 16
13
+ reproject_vision: false
14
+ use_flash_attention: true
15
+ load_bf16: false
16
+ collator_overwrite_image_inputs: false
17
+ eagle_collator: true
18
+ backbone_trainable_params_fp32: true
19
+ image_crop_size: null
20
+ image_target_size: null
21
+ shortest_image_edge: 256
22
+ crop_fraction: 0.95
23
+ random_rotation_angle: null
24
+ color_jitter_params:
25
+ brightness: 0.3
26
+ contrast: 0.4
27
+ saturation: 0.5
28
+ hue: 0.08
29
+ use_albumentations_transforms: true
30
+ extra_augmentation_config: null
31
+ formalize_language: true
32
+ apply_sincos_state_encoding: false
33
+ use_relative_action: true
34
+ max_state_dim: 29
35
+ max_action_dim: 29
36
+ action_horizon: 16
37
+ hidden_size: 1024
38
+ input_embedding_dim: 1536
39
+ add_pos_embed: true
40
+ attn_dropout: 0.2
41
+ use_vlln: true
42
+ max_seq_len: 1024
43
+ use_alternate_vl_dit: true
44
+ attend_text_every_n_blocks: 2
45
+ diffusion_model_cfg:
46
+ positional_embeddings: null
47
+ num_layers: 32
48
+ num_attention_heads: 32
49
+ attention_head_dim: 48
50
+ norm_type: ada_norm
51
+ dropout: 0.2
52
+ final_dropout: true
53
+ output_dim: 1024
54
+ interleave_self_attention: true
55
+ num_inference_timesteps: 4
56
+ noise_beta_alpha: 1.5
57
+ noise_beta_beta: 1.0
58
+ noise_s: 0.999
59
+ num_timestep_buckets: 1000
60
+ tune_projector: true
61
+ tune_diffusion_model: true
62
+ tune_vlln: true
63
+ state_dropout_prob: 0.0
64
+ state_additive_noise_scale: 0.0
65
+ max_num_embodiments: 32
66
+ data:
67
+ datasets:
68
+ - dataset_paths:
69
+ - /data/datasets/Dongkkka/task_513_head_track_lerobot
70
+ embodiment_tag: new_embodiment
71
+ mix_ratio: 1.0
72
+ dataset_type: physical_embodiment
73
+ val_dataset_path: null
74
+ modality_configs:
75
+ new_embodiment:
76
+ video:
77
+ delta_indices:
78
+ - 0
79
+ modality_keys:
80
+ - cam_left_head
81
+ sin_cos_embedding_keys: null
82
+ mean_std_embedding_keys: null
83
+ action_configs: null
84
+ state:
85
+ delta_indices:
86
+ - 0
87
+ modality_keys:
88
+ - arm_left
89
+ - arm_right
90
+ - head
91
+ sin_cos_embedding_keys: null
92
+ mean_std_embedding_keys: null
93
+ action_configs: null
94
+ action:
95
+ delta_indices:
96
+ - 0
97
+ - 1
98
+ - 2
99
+ - 3
100
+ - 4
101
+ - 5
102
+ - 6
103
+ - 7
104
+ - 8
105
+ - 9
106
+ - 10
107
+ - 11
108
+ - 12
109
+ - 13
110
+ - 14
111
+ - 15
112
+ modality_keys:
113
+ - arm_left
114
+ - arm_right
115
+ - head
116
+ sin_cos_embedding_keys: null
117
+ mean_std_embedding_keys: null
118
+ action_configs:
119
+ - rep: ABSOLUTE
120
+ type: NON_EEF
121
+ format: DEFAULT
122
+ state_key: null
123
+ - rep: ABSOLUTE
124
+ type: NON_EEF
125
+ format: DEFAULT
126
+ state_key: null
127
+ - rep: ABSOLUTE
128
+ type: NON_EEF
129
+ format: DEFAULT
130
+ state_key: null
131
+ language:
132
+ delta_indices:
133
+ - 0
134
+ modality_keys:
135
+ - annotation.human.task_description
136
+ sin_cos_embedding_keys: null
137
+ mean_std_embedding_keys: null
138
+ action_configs: null
139
+ download_cache: false
140
+ shard_size: 1024
141
+ episode_sampling_rate: 0.1
142
+ num_shards_per_epoch: 100000
143
+ override_pretraining_statistics: false
144
+ mode: single_turn
145
+ random_chop: 0.0
146
+ mock_dataset_mode: false
147
+ shuffle: true
148
+ seed: 42
149
+ multiprocessing_context: fork
150
+ allow_padding: false
151
+ subsample_ratio: 1.0
152
+ image_crop_size:
153
+ - 244
154
+ - 244
155
+ image_target_size:
156
+ - 224
157
+ - 224
158
+ video_backend: torchcodec
159
+ training:
160
+ output_dir: /data/checkpoints/task_513_head_track
161
+ experiment_name: null
162
+ max_steps: 80000
163
+ global_batch_size: 48
164
+ batch_size: null
165
+ gradient_accumulation_steps: 1
166
+ learning_rate: 0.0001
167
+ lr_scheduler_type: cosine
168
+ weight_decay: 1.0e-05
169
+ warmup_ratio: 0.05
170
+ warmup_steps: 0
171
+ max_grad_norm: 1.0
172
+ optim: adamw_torch
173
+ start_from_checkpoint: nvidia/GR00T-N1.6-3B
174
+ tf32: true
175
+ fp16: false
176
+ bf16: true
177
+ eval_bf16: true
178
+ logging_steps: 10
179
+ save_steps: 20000
180
+ save_total_limit: 10
181
+ save_vl_model: false
182
+ upload_checkpoints: false
183
+ upload_every: 1000
184
+ upload_last_n_checkpoints: 5
185
+ max_concurrent_uploads: 2
186
+ eval_strategy: 'no'
187
+ eval_steps: 500
188
+ eval_set_split_ratio: 0.1
189
+ eval_batch_size: 2
190
+ save_best_eval_metric_name: ''
191
+ save_best_eval_metric_greater_is_better: true
192
+ deepspeed_stage: 2
193
+ gradient_checkpointing: false
194
+ transformers_trust_remote_code: true
195
+ transformers_local_files_only: false
196
+ transformers_cache_dir: null
197
+ transformers_access_token: null
198
+ use_ddp: false
199
+ ddp_bucket_cap_mb: 100
200
+ num_gpus: 1
201
+ dataloader_num_workers: 8
202
+ remove_unused_columns: false
203
+ use_wandb: false
204
+ wandb_project: finetune-gr00t-n1d6
205
+ enable_profiling: false
206
+ max_retries: 3
207
+ assert_loss_less_than: null
208
+ add_rl_callback: false
209
+ enable_open_loop_eval: false
210
+ open_loop_eval_traj_ids:
211
+ - 0
212
+ open_loop_eval_steps_per_traj: 100
213
+ open_loop_eval_plot_indices: null
214
+ max_steps: 80000
215
+ save_steps: 20000
experiment_cfg/config.yaml ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !!python/object:gr00t.configs.base_config.Config
2
+ data: !!python/object:gr00t.configs.data.data_config.DataConfig
3
+ allow_padding: false
4
+ datasets:
5
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
6
+ dataset_paths:
7
+ - /data/datasets/Dongkkka/task_513_head_track_lerobot
8
+ dataset_type: physical_embodiment
9
+ embodiment_tag: new_embodiment
10
+ mix_ratio: 1.0
11
+ val_dataset_path: null
12
+ download_cache: false
13
+ episode_sampling_rate: 0.1
14
+ image_crop_size:
15
+ - 244
16
+ - 244
17
+ image_target_size:
18
+ - 224
19
+ - 224
20
+ mock_dataset_mode: false
21
+ modality_configs:
22
+ new_embodiment:
23
+ action: !!python/object:gr00t.data.types.ModalityConfig
24
+ action_configs:
25
+ - !!python/object:gr00t.data.types.ActionConfig
26
+ format: &id001 !!python/object/apply:gr00t.data.types.ActionFormat
27
+ - default
28
+ rep: &id002 !!python/object/apply:gr00t.data.types.ActionRepresentation
29
+ - absolute
30
+ state_key: null
31
+ type: &id003 !!python/object/apply:gr00t.data.types.ActionType
32
+ - non_eef
33
+ - !!python/object:gr00t.data.types.ActionConfig
34
+ format: *id001
35
+ rep: *id002
36
+ state_key: null
37
+ type: *id003
38
+ - !!python/object:gr00t.data.types.ActionConfig
39
+ format: *id001
40
+ rep: *id002
41
+ state_key: null
42
+ type: *id003
43
+ delta_indices:
44
+ - 0
45
+ - 1
46
+ - 2
47
+ - 3
48
+ - 4
49
+ - 5
50
+ - 6
51
+ - 7
52
+ - 8
53
+ - 9
54
+ - 10
55
+ - 11
56
+ - 12
57
+ - 13
58
+ - 14
59
+ - 15
60
+ mean_std_embedding_keys: null
61
+ modality_keys:
62
+ - arm_left
63
+ - arm_right
64
+ - head
65
+ sin_cos_embedding_keys: null
66
+ language: !!python/object:gr00t.data.types.ModalityConfig
67
+ action_configs: null
68
+ delta_indices:
69
+ - 0
70
+ mean_std_embedding_keys: null
71
+ modality_keys:
72
+ - annotation.human.task_description
73
+ sin_cos_embedding_keys: null
74
+ state: !!python/object:gr00t.data.types.ModalityConfig
75
+ action_configs: null
76
+ delta_indices:
77
+ - 0
78
+ mean_std_embedding_keys: null
79
+ modality_keys:
80
+ - arm_left
81
+ - arm_right
82
+ - head
83
+ sin_cos_embedding_keys: null
84
+ video: !!python/object:gr00t.data.types.ModalityConfig
85
+ action_configs: null
86
+ delta_indices:
87
+ - 0
88
+ mean_std_embedding_keys: null
89
+ modality_keys:
90
+ - cam_left_head
91
+ sin_cos_embedding_keys: null
92
+ mode: single_turn
93
+ multiprocessing_context: fork
94
+ num_shards_per_epoch: 100000
95
+ override_pretraining_statistics: false
96
+ random_chop: 0.0
97
+ seed: 42
98
+ shard_size: 1024
99
+ shuffle: true
100
+ subsample_ratio: 1.0
101
+ video_backend: torchcodec
102
+ load_config_path: null
103
+ model: !!python/object:gr00t.configs.model.gr00t_n1d6.Gr00tN1d6Config
104
+ _attn_implementation_autoset: false
105
+ _attn_implementation_internal: null
106
+ _commit_hash: null
107
+ _name_or_path: ''
108
+ add_cross_attention: false
109
+ architectures: null
110
+ backbone_model_type: eagle
111
+ backbone_trainable_params_fp32: true
112
+ bad_words_ids: null
113
+ begin_suppress_tokens: null
114
+ bos_token_id: null
115
+ chunk_size_feed_forward: 0
116
+ color_jitter_params:
117
+ brightness: 0.3
118
+ contrast: 0.4
119
+ hue: 0.08
120
+ saturation: 0.5
121
+ cross_attention_hidden_size: null
122
+ decoder_start_token_id: null
123
+ diffusion_model_cfg:
124
+ attention_head_dim: 48
125
+ dropout: 0.2
126
+ final_dropout: true
127
+ interleave_self_attention: true
128
+ norm_type: ada_norm
129
+ num_attention_heads: 32
130
+ num_layers: 32
131
+ output_dim: 1024
132
+ positional_embeddings: null
133
+ diversity_penalty: 0.0
134
+ do_sample: false
135
+ eagle_collator: true
136
+ early_stopping: false
137
+ encoder_no_repeat_ngram_size: 0
138
+ eos_token_id: null
139
+ exponential_decay_length_penalty: null
140
+ extra_augmentation_config: null
141
+ finetuning_task: null
142
+ forced_bos_token_id: null
143
+ forced_eos_token_id: null
144
+ id2label:
145
+ 0: LABEL_0
146
+ 1: LABEL_1
147
+ is_decoder: false
148
+ is_encoder_decoder: false
149
+ label2id:
150
+ LABEL_0: 0
151
+ LABEL_1: 1
152
+ length_penalty: 1.0
153
+ load_bf16: false
154
+ max_length: 20
155
+ min_length: 0
156
+ model_name: nvidia/Eagle-Block2A-2B-v2
157
+ no_repeat_ngram_size: 0
158
+ num_beam_groups: 1
159
+ num_beams: 1
160
+ num_return_sequences: 1
161
+ output_attentions: false
162
+ output_hidden_states: false
163
+ output_scores: false
164
+ pad_token_id: null
165
+ prefix: null
166
+ problem_type: null
167
+ pruned_heads: {}
168
+ random_rotation_angle: null
169
+ remove_invalid_values: false
170
+ repetition_penalty: 1.0
171
+ reproject_vision: false
172
+ return_dict: true
173
+ return_dict_in_generate: false
174
+ sep_token_id: null
175
+ state_dropout_prob: 0.0
176
+ suppress_tokens: null
177
+ task_specific_params: null
178
+ temperature: 1.0
179
+ tf_legacy_loss: false
180
+ tie_encoder_decoder: false
181
+ tie_word_embeddings: true
182
+ tokenizer_class: null
183
+ top_k: 50
184
+ top_p: 1.0
185
+ torch_dtype: null
186
+ torchscript: false
187
+ transformers_version: null
188
+ tune_diffusion_model: true
189
+ tune_llm: false
190
+ tune_projector: true
191
+ tune_visual: false
192
+ typical_p: 1.0
193
+ use_bfloat16: false
194
+ use_relative_action: true
195
+ training: !!python/object:gr00t.configs.training.training_config.TrainingConfig
196
+ add_rl_callback: false
197
+ assert_loss_less_than: null
198
+ batch_size: null
199
+ bf16: true
200
+ dataloader_num_workers: 8
201
+ ddp_bucket_cap_mb: 100
202
+ deepspeed_stage: 2
203
+ enable_open_loop_eval: false
204
+ enable_profiling: false
205
+ eval_batch_size: 2
206
+ eval_bf16: true
207
+ eval_set_split_ratio: 0.1
208
+ eval_steps: 500
209
+ eval_strategy: 'no'
210
+ experiment_name: null
211
+ fp16: false
212
+ global_batch_size: 48
213
+ gradient_accumulation_steps: 1
214
+ gradient_checkpointing: false
215
+ learning_rate: 0.0001
216
+ logging_steps: 10
217
+ lr_scheduler_type: cosine
218
+ max_concurrent_uploads: 2
219
+ max_grad_norm: 1.0
220
+ max_retries: 3
221
+ max_steps: 80000
222
+ num_gpus: 1
223
+ open_loop_eval_plot_indices: null
224
+ open_loop_eval_steps_per_traj: 100
225
+ open_loop_eval_traj_ids:
226
+ - 0
227
+ optim: adamw_torch
228
+ output_dir: /data/checkpoints/task_513_head_track
229
+ remove_unused_columns: false
230
+ save_best_eval_metric_greater_is_better: true
231
+ save_best_eval_metric_name: ''
232
+ save_steps: 20000
233
+ save_total_limit: 10
234
+ save_vl_model: false
235
+ start_from_checkpoint: nvidia/GR00T-N1.6-3B
236
+ tf32: true
237
+ transformers_access_token: null
238
+ transformers_cache_dir: null
239
+ transformers_local_files_only: false
240
+ transformers_trust_remote_code: true
241
+ upload_checkpoints: false
242
+ upload_every: 1000
243
+ upload_last_n_checkpoints: 5
244
+ use_ddp: false
245
+ use_wandb: false
246
+ wandb_project: finetune-gr00t-n1d6
247
+ warmup_ratio: 0.05
248
+ warmup_steps: 0
249
+ weight_decay: 1.0e-05
experiment_cfg/dataset_statistics.json ADDED
@@ -0,0 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "new_embodiment": {
3
+ "state": {
4
+ "arm_left": {
5
+ "min": [
6
+ -0.32769665122032166,
7
+ -3.5952674807049334e-05,
8
+ -0.7671821117401123,
9
+ -2.6660585403442383,
10
+ -0.24030767381191254,
11
+ -0.4148099720478058,
12
+ -0.4670966565608978,
13
+ 0.016873788088560104
14
+ ],
15
+ "max": [
16
+ 1.1045020818710327,
17
+ 0.4448903799057007,
18
+ 0.5149860978126526,
19
+ -1.0441975593566895,
20
+ 0.5722227692604065,
21
+ 0.891901969909668,
22
+ 0.3085845410823822,
23
+ 1.1182719469070435
24
+ ],
25
+ "mean": [
26
+ 0.5965352058410645,
27
+ 0.222667396068573,
28
+ -0.001857798546552658,
29
+ -2.1536648273468018,
30
+ 0.1427067071199417,
31
+ -0.004120463039726019,
32
+ 0.04518925026059151,
33
+ 0.23380941152572632
34
+ ],
35
+ "std": [
36
+ 0.26479241251945496,
37
+ 0.06473368406295776,
38
+ 0.185212641954422,
39
+ 0.23029643297195435,
40
+ 0.08799401670694351,
41
+ 0.2201998233795166,
42
+ 0.0981917753815651,
43
+ 0.3348628580570221
44
+ ],
45
+ "q01": [
46
+ -0.11296354733407497,
47
+ 0.05661935109645128,
48
+ -0.6153852707147598,
49
+ -2.6107337498664855,
50
+ -0.12767501741647722,
51
+ -0.3016920751333237,
52
+ -0.2515653073787689,
53
+ 0.04448544234037399
54
+ ],
55
+ "q99": [
56
+ 1.0746320509910585,
57
+ 0.36018048554658905,
58
+ 0.2970529794692993,
59
+ -1.4099017357826231,
60
+ 0.34274882078170776,
61
+ 0.6542050760984429,
62
+ 0.23363251492381126,
63
+ 1.1136701107025146
64
+ ]
65
+ },
66
+ "arm_right": {
67
+ "min": [
68
+ -0.4739161729812622,
69
+ -0.5639536380767822,
70
+ -0.9659045338630676,
71
+ -2.7155895233154297,
72
+ -0.6204113364219666,
73
+ -0.635463535785675,
74
+ -0.2742881774902344,
75
+ 0.0
76
+ ],
77
+ "max": [
78
+ 1.1228739023208618,
79
+ -0.0014740596525371075,
80
+ 0.6081634759902954,
81
+ -1.0477927923202515,
82
+ 0.29277461767196655,
83
+ 1.1570649147033691,
84
+ 0.8721371293067932,
85
+ 1.1213399171829224
86
+ ],
87
+ "mean": [
88
+ 0.5744096040725708,
89
+ -0.1578063666820526,
90
+ -0.09115820378065109,
91
+ -2.188415765762329,
92
+ -0.15953116118907928,
93
+ 0.11276087909936905,
94
+ 0.14848783612251282,
95
+ 0.3304852247238159
96
+ ],
97
+ "std": [
98
+ 0.24306164681911469,
99
+ 0.09848367422819138,
100
+ 0.2565942108631134,
101
+ 0.2016150951385498,
102
+ 0.13883453607559204,
103
+ 0.3072238266468048,
104
+ 0.19907903671264648,
105
+ 0.40453416109085083
106
+ ],
107
+ "q01": [
108
+ -0.18488698646426202,
109
+ -0.4454313454031944,
110
+ -0.700841807126999,
111
+ -2.598863844871521,
112
+ -0.4912299048900604,
113
+ -0.3342285332083702,
114
+ -0.16355043157935142,
115
+ 0.023009711876511574
116
+ ],
117
+ "q99": [
118
+ 1.046805627346039,
119
+ -0.0015219965716823936,
120
+ 0.43641214400529876,
121
+ -1.529058262109756,
122
+ 0.13974876329302802,
123
+ 0.8599527513980867,
124
+ 0.6667004805803299,
125
+ 1.1182719469070435
126
+ ]
127
+ },
128
+ "head": {
129
+ "min": [
130
+ 0.6596117615699768,
131
+ -0.3497476279735565
132
+ ],
133
+ "max": [
134
+ 0.8620972037315369,
135
+ 0.3497476279735565
136
+ ],
137
+ "mean": [
138
+ 0.6913227438926697,
139
+ -0.004876141902059317
140
+ ],
141
+ "std": [
142
+ 0.015439425595103992,
143
+ 0.1833014339208603
144
+ ],
145
+ "q01": [
146
+ 0.6810874938964844,
147
+ -0.3466796576976776
148
+ ],
149
+ "q99": [
150
+ 0.6933593153953552,
151
+ 0.34207770228385925
152
+ ]
153
+ }
154
+ },
155
+ "action": {
156
+ "arm_left": {
157
+ "min": [
158
+ -0.32827189564704895,
159
+ -0.058291271328926086,
160
+ -0.7669904232025146,
161
+ -2.6660585403442383,
162
+ -0.24083498120307922,
163
+ -0.4332428276538849,
164
+ -0.4663301706314087,
165
+ -0.2982214093208313
166
+ ],
167
+ "max": [
168
+ 1.104466199874878,
169
+ 0.4448544383049011,
170
+ 0.5154175162315369,
171
+ -1.04157292842865,
172
+ 0.5660389065742493,
173
+ 0.8921166062355042,
174
+ 0.30833014845848083,
175
+ 2.3138411045074463
176
+ ],
177
+ "mean": [
178
+ 0.5970768332481384,
179
+ 0.22290204465389252,
180
+ -0.0018270841101184487,
181
+ -2.154219627380371,
182
+ 0.14274784922599792,
183
+ -0.004338364116847515,
184
+ 0.0453636571764946,
185
+ 0.35635656118392944
186
+ ],
187
+ "std": [
188
+ 0.26535090804100037,
189
+ 0.0649295374751091,
190
+ 0.1859433650970459,
191
+ 0.23075419664382935,
192
+ 0.08828757703304291,
193
+ 0.22086213529109955,
194
+ 0.09835304319858551,
195
+ 0.696665346622467
196
+ ],
197
+ "q01": [
198
+ -0.11658254265785217,
199
+ 0.05675728991627693,
200
+ -0.6174426293373108,
201
+ -2.610835313796997,
202
+ -0.13038836419582367,
203
+ -0.30132046341896057,
204
+ -0.251572847366333,
205
+ 0.01900581456720829
206
+ ],
207
+ "q99": [
208
+ 1.0753204822540283,
209
+ 0.3604854941368103,
210
+ 0.2991262674331665,
211
+ -1.4066604375839233,
212
+ 0.34514567255973816,
213
+ 0.6558835506439209,
214
+ 0.2346990555524826,
215
+ 2.300342082977295
216
+ ]
217
+ },
218
+ "arm_right": {
219
+ "min": [
220
+ -0.47400006651878357,
221
+ -0.5675728917121887,
222
+ -0.9679418802261353,
223
+ -2.719748020172119,
224
+ -0.6212621927261353,
225
+ -0.6510680913925171,
226
+ -0.27458256483078003,
227
+ -0.12610876560211182
228
+ ],
229
+ "max": [
230
+ 1.1228739023208618,
231
+ 0.11965049803256989,
232
+ 0.6089903712272644,
233
+ -1.0477088689804077,
234
+ 0.30066022276878357,
235
+ 1.159029245376587,
236
+ 0.8728350400924683,
237
+ 2.3307149410247803
238
+ ],
239
+ "mean": [
240
+ 0.5744243860244751,
241
+ -0.15571700036525726,
242
+ -0.09068138897418976,
243
+ -2.1886820793151855,
244
+ -0.15848886966705322,
245
+ 0.11278273165225983,
246
+ 0.14838500320911407,
247
+ 0.5432578921318054
248
+ ],
249
+ "std": [
250
+ 0.24373859167099,
251
+ 0.10171724110841751,
252
+ 0.2581823766231537,
253
+ 0.20275290310382788,
254
+ 0.13970211148262024,
255
+ 0.3082216680049896,
256
+ 0.199318066239357,
257
+ 0.8717935681343079
258
+ ],
259
+ "q01": [
260
+ -0.18792798280715942,
261
+ -0.4463884234428406,
262
+ -0.7040972113609314,
263
+ -2.6016314029693604,
264
+ -0.4939418137073517,
265
+ -0.3350680470466614,
266
+ -0.16338429510593414,
267
+ -0.045114584267139435
268
+ ],
269
+ "q99": [
270
+ 1.0461748838424683,
271
+ 0.0475534051656723,
272
+ 0.4387184977531433,
273
+ -1.526310920715332,
274
+ 0.14112623035907745,
275
+ 0.8614369630813599,
276
+ 0.666529974937442,
277
+ 2.3138411045074463
278
+ ]
279
+ },
280
+ "head": {
281
+ "min": [
282
+ 0.658930778503418,
283
+ -0.3499999940395355
284
+ ],
285
+ "max": [
286
+ 0.6951000094413757,
287
+ 0.3499999940395355
288
+ ],
289
+ "mean": [
290
+ 0.695273756980896,
291
+ -0.004491189029067755
292
+ ],
293
+ "std": [
294
+ 0.00047280511353174946,
295
+ 0.184040829539299
296
+ ],
297
+ "q01": [
298
+ 0.6951000094413757,
299
+ -0.3499999940395355
300
+ ],
301
+ "q99": [
302
+ 0.6951000094413757,
303
+ 0.3499999940395355
304
+ ]
305
+ }
306
+ },
307
+ "relative_action": {}
308
+ }
309
+ }
experiment_cfg/final_model_config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "Gr00tN1d6",
3
+ "model_dtype": "bfloat16",
4
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
5
+ "backbone_model_type": "eagle",
6
+ "model_revision": null,
7
+ "tune_top_llm_layers": 4,
8
+ "backbone_embedding_dim": 2048,
9
+ "tune_llm": false,
10
+ "tune_visual": false,
11
+ "select_layer": 16,
12
+ "reproject_vision": false,
13
+ "use_flash_attention": true,
14
+ "load_bf16": true,
15
+ "collator_overwrite_image_inputs": false,
16
+ "eagle_collator": true,
17
+ "backbone_trainable_params_fp32": true,
18
+ "extra_augmentation_config": null,
19
+ "apply_sincos_state_encoding": true,
20
+ "use_relative_action": true,
21
+ "max_state_dim": 128,
22
+ "max_action_dim": 128,
23
+ "action_horizon": 50,
24
+ "hidden_size": 1024,
25
+ "input_embedding_dim": 1536,
26
+ "add_pos_embed": true,
27
+ "attn_dropout": 0.2,
28
+ "use_vlln": true,
29
+ "max_seq_len": 1024,
30
+ "use_alternate_vl_dit": true,
31
+ "attend_text_every_n_blocks": 2,
32
+ "diffusion_model_cfg": {
33
+ "attention_head_dim": 48,
34
+ "dropout": 0.2,
35
+ "final_dropout": true,
36
+ "interleave_self_attention": true,
37
+ "norm_type": "ada_norm",
38
+ "num_attention_heads": 32,
39
+ "num_layers": 32,
40
+ "output_dim": 1024,
41
+ "positional_embeddings": null
42
+ },
43
+ "num_inference_timesteps": 4,
44
+ "noise_beta_alpha": 1.5,
45
+ "noise_beta_beta": 1.0,
46
+ "noise_s": 0.999,
47
+ "num_timestep_buckets": 1000,
48
+ "tune_projector": true,
49
+ "tune_diffusion_model": true,
50
+ "tune_vlln": true,
51
+ "state_dropout_prob": 0.0,
52
+ "state_additive_noise_scale": 0.0,
53
+ "max_num_embodiments": 32
54
+ }
experiment_cfg/final_processor_config.json ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2754c6315a8b9c176529e8d83f1be61fdddefc9190b4ae4493a5010185fd865
3
+ size 4990120184
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e8bb2df7b513bdcc1271220de1224d1c7e1df7edada56fde2a335469bc9a6d2
3
+ size 4823190320
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
processor_config.json ADDED
@@ -0,0 +1,462 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "Gr00tN1d6Processor",
3
+ "processor_kwargs": {
4
+ "modality_configs": {
5
+ "behavior_r1_pro": {
6
+ "video": {
7
+ "delta_indices": [
8
+ 0
9
+ ],
10
+ "modality_keys": [
11
+ "observation.images.rgb.head_256_256",
12
+ "observation.images.rgb.left_wrist_256_256",
13
+ "observation.images.rgb.right_wrist_256_256"
14
+ ],
15
+ "sin_cos_embedding_keys": null,
16
+ "mean_std_embedding_keys": null,
17
+ "action_configs": null
18
+ },
19
+ "state": {
20
+ "delta_indices": [
21
+ 0
22
+ ],
23
+ "modality_keys": [
24
+ "robot_pos",
25
+ "robot_ori_cos",
26
+ "robot_ori_sin",
27
+ "robot_2d_ori",
28
+ "robot_2d_ori_cos",
29
+ "robot_2d_ori_sin",
30
+ "robot_lin_vel",
31
+ "robot_ang_vel",
32
+ "arm_left_qpos",
33
+ "arm_left_qpos_sin",
34
+ "arm_left_qpos_cos",
35
+ "eef_left_pos",
36
+ "eef_left_quat",
37
+ "gripper_left_qpos",
38
+ "arm_right_qpos",
39
+ "arm_right_qpos_sin",
40
+ "arm_right_qpos_cos",
41
+ "eef_right_pos",
42
+ "eef_right_quat",
43
+ "gripper_right_qpos",
44
+ "trunk_qpos"
45
+ ],
46
+ "sin_cos_embedding_keys": null,
47
+ "mean_std_embedding_keys": null,
48
+ "action_configs": null
49
+ },
50
+ "action": {
51
+ "delta_indices": [
52
+ 0,
53
+ 1,
54
+ 2,
55
+ 3,
56
+ 4,
57
+ 5,
58
+ 6,
59
+ 7,
60
+ 8,
61
+ 9,
62
+ 10,
63
+ 11,
64
+ 12,
65
+ 13,
66
+ 14,
67
+ 15,
68
+ 16,
69
+ 17,
70
+ 18,
71
+ 19,
72
+ 20,
73
+ 21,
74
+ 22,
75
+ 23,
76
+ 24,
77
+ 25,
78
+ 26,
79
+ 27,
80
+ 28,
81
+ 29,
82
+ 30,
83
+ 31
84
+ ],
85
+ "modality_keys": [
86
+ "base",
87
+ "torso",
88
+ "left_arm",
89
+ "left_gripper",
90
+ "right_arm",
91
+ "right_gripper"
92
+ ],
93
+ "sin_cos_embedding_keys": null,
94
+ "mean_std_embedding_keys": null,
95
+ "action_configs": [
96
+ {
97
+ "rep": "ABSOLUTE",
98
+ "type": "NON_EEF",
99
+ "format": "DEFAULT",
100
+ "state_key": null
101
+ },
102
+ {
103
+ "rep": "RELATIVE",
104
+ "type": "NON_EEF",
105
+ "format": "DEFAULT",
106
+ "state_key": "trunk_qpos"
107
+ },
108
+ {
109
+ "rep": "RELATIVE",
110
+ "type": "NON_EEF",
111
+ "format": "DEFAULT",
112
+ "state_key": "arm_left_qpos"
113
+ },
114
+ {
115
+ "rep": "ABSOLUTE",
116
+ "type": "NON_EEF",
117
+ "format": "DEFAULT",
118
+ "state_key": null
119
+ },
120
+ {
121
+ "rep": "RELATIVE",
122
+ "type": "NON_EEF",
123
+ "format": "DEFAULT",
124
+ "state_key": "arm_right_qpos"
125
+ },
126
+ {
127
+ "rep": "ABSOLUTE",
128
+ "type": "NON_EEF",
129
+ "format": "DEFAULT",
130
+ "state_key": null
131
+ }
132
+ ]
133
+ },
134
+ "language": {
135
+ "delta_indices": [
136
+ 0
137
+ ],
138
+ "modality_keys": [
139
+ "annotation.human.coarse_action"
140
+ ],
141
+ "sin_cos_embedding_keys": null,
142
+ "mean_std_embedding_keys": null,
143
+ "action_configs": null
144
+ }
145
+ },
146
+ "gr1": {
147
+ "video": {
148
+ "delta_indices": [
149
+ 0
150
+ ],
151
+ "modality_keys": [
152
+ "ego_view_bg_crop_pad_res256_freq20"
153
+ ],
154
+ "sin_cos_embedding_keys": null,
155
+ "mean_std_embedding_keys": null,
156
+ "action_configs": null
157
+ },
158
+ "state": {
159
+ "delta_indices": [
160
+ 0
161
+ ],
162
+ "modality_keys": [
163
+ "left_arm",
164
+ "right_arm",
165
+ "left_hand",
166
+ "right_hand",
167
+ "waist"
168
+ ],
169
+ "sin_cos_embedding_keys": [
170
+ "left_arm",
171
+ "right_arm",
172
+ "left_hand",
173
+ "right_hand",
174
+ "waist"
175
+ ],
176
+ "mean_std_embedding_keys": null,
177
+ "action_configs": null
178
+ },
179
+ "action": {
180
+ "delta_indices": [
181
+ 0,
182
+ 1,
183
+ 2,
184
+ 3,
185
+ 4,
186
+ 5,
187
+ 6,
188
+ 7,
189
+ 8,
190
+ 9,
191
+ 10,
192
+ 11,
193
+ 12,
194
+ 13,
195
+ 14,
196
+ 15
197
+ ],
198
+ "modality_keys": [
199
+ "left_arm",
200
+ "right_arm",
201
+ "left_hand",
202
+ "right_hand",
203
+ "waist"
204
+ ],
205
+ "sin_cos_embedding_keys": null,
206
+ "mean_std_embedding_keys": null,
207
+ "action_configs": [
208
+ {
209
+ "rep": "RELATIVE",
210
+ "type": "NON_EEF",
211
+ "format": "DEFAULT",
212
+ "state_key": null
213
+ },
214
+ {
215
+ "rep": "RELATIVE",
216
+ "type": "NON_EEF",
217
+ "format": "DEFAULT",
218
+ "state_key": null
219
+ },
220
+ {
221
+ "rep": "RELATIVE",
222
+ "type": "NON_EEF",
223
+ "format": "DEFAULT",
224
+ "state_key": null
225
+ },
226
+ {
227
+ "rep": "RELATIVE",
228
+ "type": "NON_EEF",
229
+ "format": "DEFAULT",
230
+ "state_key": null
231
+ },
232
+ {
233
+ "rep": "ABSOLUTE",
234
+ "type": "NON_EEF",
235
+ "format": "DEFAULT",
236
+ "state_key": null
237
+ }
238
+ ]
239
+ },
240
+ "language": {
241
+ "delta_indices": [
242
+ 0
243
+ ],
244
+ "modality_keys": [
245
+ "task"
246
+ ],
247
+ "sin_cos_embedding_keys": null,
248
+ "mean_std_embedding_keys": null,
249
+ "action_configs": null
250
+ }
251
+ },
252
+ "robocasa_panda_omron": {
253
+ "video": {
254
+ "delta_indices": [
255
+ 0
256
+ ],
257
+ "modality_keys": [
258
+ "res256_image_side_0",
259
+ "res256_image_side_1",
260
+ "res256_image_wrist_0"
261
+ ],
262
+ "sin_cos_embedding_keys": null,
263
+ "mean_std_embedding_keys": null,
264
+ "action_configs": null
265
+ },
266
+ "state": {
267
+ "delta_indices": [
268
+ 0
269
+ ],
270
+ "modality_keys": [
271
+ "end_effector_position_relative",
272
+ "end_effector_rotation_relative",
273
+ "gripper_qpos",
274
+ "base_position",
275
+ "base_rotation"
276
+ ],
277
+ "sin_cos_embedding_keys": null,
278
+ "mean_std_embedding_keys": null,
279
+ "action_configs": null
280
+ },
281
+ "action": {
282
+ "delta_indices": [
283
+ 0,
284
+ 1,
285
+ 2,
286
+ 3,
287
+ 4,
288
+ 5,
289
+ 6,
290
+ 7,
291
+ 8,
292
+ 9,
293
+ 10,
294
+ 11,
295
+ 12,
296
+ 13,
297
+ 14,
298
+ 15
299
+ ],
300
+ "modality_keys": [
301
+ "end_effector_position",
302
+ "end_effector_rotation",
303
+ "gripper_close",
304
+ "base_motion",
305
+ "control_mode"
306
+ ],
307
+ "sin_cos_embedding_keys": null,
308
+ "mean_std_embedding_keys": null,
309
+ "action_configs": [
310
+ {
311
+ "rep": "ABSOLUTE",
312
+ "type": "NON_EEF",
313
+ "format": "DEFAULT",
314
+ "state_key": null
315
+ },
316
+ {
317
+ "rep": "ABSOLUTE",
318
+ "type": "NON_EEF",
319
+ "format": "DEFAULT",
320
+ "state_key": null
321
+ },
322
+ {
323
+ "rep": "ABSOLUTE",
324
+ "type": "NON_EEF",
325
+ "format": "DEFAULT",
326
+ "state_key": null
327
+ },
328
+ {
329
+ "rep": "ABSOLUTE",
330
+ "type": "NON_EEF",
331
+ "format": "DEFAULT",
332
+ "state_key": null
333
+ },
334
+ {
335
+ "rep": "ABSOLUTE",
336
+ "type": "NON_EEF",
337
+ "format": "DEFAULT",
338
+ "state_key": null
339
+ }
340
+ ]
341
+ },
342
+ "language": {
343
+ "delta_indices": [
344
+ 0
345
+ ],
346
+ "modality_keys": [
347
+ "annotation.human.action.task_description"
348
+ ],
349
+ "sin_cos_embedding_keys": null,
350
+ "mean_std_embedding_keys": null,
351
+ "action_configs": null
352
+ }
353
+ },
354
+ "new_embodiment": {
355
+ "video": {
356
+ "delta_indices": [
357
+ 0
358
+ ],
359
+ "modality_keys": [
360
+ "cam_left_head"
361
+ ],
362
+ "sin_cos_embedding_keys": null,
363
+ "mean_std_embedding_keys": null,
364
+ "action_configs": null
365
+ },
366
+ "state": {
367
+ "delta_indices": [
368
+ 0
369
+ ],
370
+ "modality_keys": [
371
+ "arm_left",
372
+ "arm_right",
373
+ "head"
374
+ ],
375
+ "sin_cos_embedding_keys": null,
376
+ "mean_std_embedding_keys": null,
377
+ "action_configs": null
378
+ },
379
+ "action": {
380
+ "delta_indices": [
381
+ 0,
382
+ 1,
383
+ 2,
384
+ 3,
385
+ 4,
386
+ 5,
387
+ 6,
388
+ 7,
389
+ 8,
390
+ 9,
391
+ 10,
392
+ 11,
393
+ 12,
394
+ 13,
395
+ 14,
396
+ 15
397
+ ],
398
+ "modality_keys": [
399
+ "arm_left",
400
+ "arm_right",
401
+ "head"
402
+ ],
403
+ "sin_cos_embedding_keys": null,
404
+ "mean_std_embedding_keys": null,
405
+ "action_configs": [
406
+ {
407
+ "rep": "ABSOLUTE",
408
+ "type": "NON_EEF",
409
+ "format": "DEFAULT",
410
+ "state_key": null
411
+ },
412
+ {
413
+ "rep": "ABSOLUTE",
414
+ "type": "NON_EEF",
415
+ "format": "DEFAULT",
416
+ "state_key": null
417
+ },
418
+ {
419
+ "rep": "ABSOLUTE",
420
+ "type": "NON_EEF",
421
+ "format": "DEFAULT",
422
+ "state_key": null
423
+ }
424
+ ]
425
+ },
426
+ "language": {
427
+ "delta_indices": [
428
+ 0
429
+ ],
430
+ "modality_keys": [
431
+ "annotation.human.task_description"
432
+ ],
433
+ "sin_cos_embedding_keys": null,
434
+ "mean_std_embedding_keys": null,
435
+ "action_configs": null
436
+ }
437
+ }
438
+ },
439
+ "image_crop_size": null,
440
+ "image_target_size": null,
441
+ "use_albumentations": true,
442
+ "random_rotation_angle": null,
443
+ "color_jitter_params": {
444
+ "brightness": 0.3,
445
+ "contrast": 0.4,
446
+ "saturation": 0.5,
447
+ "hue": 0.08
448
+ },
449
+ "shortest_image_edge": 256,
450
+ "crop_fraction": 0.95,
451
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
452
+ "model_type": "eagle",
453
+ "formalize_language": true,
454
+ "max_state_dim": 128,
455
+ "max_action_dim": 128,
456
+ "max_action_horizon": 50,
457
+ "use_percentiles": false,
458
+ "clip_outliers": true,
459
+ "apply_sincos_state_encoding": true,
460
+ "use_relative_action": true
461
+ }
462
+ }
statistics.json ADDED
The diff for this file is too large to render. See raw diff
 
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd6e9d532967de7cf6f41cf03f2bbd5910e31877e507c7e7a35b529ab41a6df6
3
+ size 5777
wandb_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"project": "finetune-gr00t-n1d6", "run_id": "task_513_head_track"}