ry-5 committed on
Commit
2a4e638
·
verified ·
1 Parent(s): 8924dbd

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. checkpoint-10000/config.json +70 -0
  2. checkpoint-10000/embodiment_id.json +11 -0
  3. checkpoint-10000/experiment_cfg/conf.yaml +206 -0
  4. checkpoint-10000/experiment_cfg/config.yaml +239 -0
  5. checkpoint-10000/experiment_cfg/dataset_statistics.json +149 -0
  6. checkpoint-10000/experiment_cfg/final_model_config.json +54 -0
  7. checkpoint-10000/experiment_cfg/final_processor_config.json +0 -0
  8. checkpoint-10000/model-00001-of-00002.safetensors +3 -0
  9. checkpoint-10000/model-00002-of-00002.safetensors +3 -0
  10. checkpoint-10000/model.safetensors.index.json +0 -0
  11. checkpoint-10000/optimizer.pt +3 -0
  12. checkpoint-10000/processor_config.json +455 -0
  13. checkpoint-10000/rng_state.pth +3 -0
  14. checkpoint-10000/scheduler.pt +3 -0
  15. checkpoint-10000/statistics.json +0 -0
  16. checkpoint-10000/trainer_state.json +0 -0
  17. checkpoint-10000/training_args.bin +3 -0
  18. checkpoint-10000/wandb_config.json +1 -0
  19. checkpoint-15000/config.json +70 -0
  20. checkpoint-15000/embodiment_id.json +11 -0
  21. checkpoint-15000/experiment_cfg/conf.yaml +206 -0
  22. checkpoint-15000/experiment_cfg/config.yaml +239 -0
  23. checkpoint-15000/experiment_cfg/dataset_statistics.json +149 -0
  24. checkpoint-15000/experiment_cfg/final_model_config.json +54 -0
  25. checkpoint-15000/experiment_cfg/final_processor_config.json +0 -0
  26. checkpoint-15000/model-00001-of-00002.safetensors +3 -0
  27. checkpoint-15000/model-00002-of-00002.safetensors +3 -0
  28. checkpoint-15000/model.safetensors.index.json +0 -0
  29. checkpoint-15000/optimizer.pt +3 -0
  30. checkpoint-15000/processor_config.json +455 -0
  31. checkpoint-15000/rng_state.pth +3 -0
  32. checkpoint-15000/scheduler.pt +3 -0
  33. checkpoint-15000/statistics.json +0 -0
  34. checkpoint-15000/trainer_state.json +0 -0
  35. checkpoint-15000/training_args.bin +3 -0
  36. checkpoint-15000/wandb_config.json +1 -0
  37. checkpoint-20000/config.json +70 -0
  38. checkpoint-20000/embodiment_id.json +11 -0
  39. checkpoint-20000/experiment_cfg/conf.yaml +206 -0
  40. checkpoint-20000/experiment_cfg/config.yaml +239 -0
  41. checkpoint-20000/experiment_cfg/dataset_statistics.json +149 -0
  42. checkpoint-20000/experiment_cfg/final_model_config.json +54 -0
  43. checkpoint-20000/experiment_cfg/final_processor_config.json +0 -0
  44. checkpoint-20000/model-00001-of-00002.safetensors +3 -0
  45. checkpoint-20000/model-00002-of-00002.safetensors +3 -0
  46. checkpoint-20000/model.safetensors.index.json +0 -0
  47. checkpoint-20000/optimizer.pt +3 -0
  48. checkpoint-20000/processor_config.json +455 -0
  49. checkpoint-20000/rng_state.pth +3 -0
  50. checkpoint-20000/scheduler.pt +3 -0
checkpoint-10000/config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_horizon": 50,
3
+ "add_pos_embed": true,
4
+ "apply_sincos_state_encoding": true,
5
+ "architectures": [
6
+ "Gr00tN1d6"
7
+ ],
8
+ "attn_dropout": 0.2,
9
+ "attn_implementation": null,
10
+ "backbone_embedding_dim": 2048,
11
+ "backbone_model_type": "eagle",
12
+ "backbone_trainable_params_fp32": true,
13
+ "collator_overwrite_image_inputs": false,
14
+ "color_jitter_params": {
15
+ "brightness": 0.1,
16
+ "contrast": 0.1,
17
+ "hue": 0.1,
18
+ "saturation": 0.1
19
+ },
20
+ "crop_fraction": 0.95,
21
+ "diffusion_model_cfg": {
22
+ "attention_head_dim": 48,
23
+ "dropout": 0.2,
24
+ "final_dropout": true,
25
+ "interleave_self_attention": true,
26
+ "norm_type": "ada_norm",
27
+ "num_attention_heads": 32,
28
+ "num_layers": 32,
29
+ "output_dim": 1024,
30
+ "positional_embeddings": null
31
+ },
32
+ "eagle_collator": true,
33
+ "formalize_language": true,
34
+ "gemma_collator": false,
35
+ "hidden_size": 1024,
36
+ "image_crop_size": null,
37
+ "image_target_size": null,
38
+ "input_embedding_dim": 1536,
39
+ "load_bf16": true,
40
+ "max_action_dim": 128,
41
+ "max_num_embodiments": 32,
42
+ "max_seq_len": 1024,
43
+ "max_state_dim": 128,
44
+ "model_dtype": "bfloat16",
45
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
46
+ "model_type": "Gr00tN1d6",
47
+ "noise_beta_alpha": 1.5,
48
+ "noise_beta_beta": 1.0,
49
+ "noise_s": 0.999,
50
+ "num_inference_timesteps": 4,
51
+ "num_timestep_buckets": 1000,
52
+ "random_rotation_angle": null,
53
+ "reproject_vision": false,
54
+ "select_layer": 16,
55
+ "shortest_image_edge": 256,
56
+ "state_dropout_prob": 0.0,
57
+ "torch_dtype": "bfloat16",
58
+ "transformers_version": "4.51.3",
59
+ "tune_diffusion_model": true,
60
+ "tune_llm": false,
61
+ "tune_projector": true,
62
+ "tune_top_llm_layers": 4,
63
+ "tune_visual": false,
64
+ "tune_vlln": true,
65
+ "use_albumentations_transforms": true,
66
+ "use_alternate_vl_dit": true,
67
+ "use_flash_attention": true,
68
+ "use_relative_action": true,
69
+ "use_vlln": true
70
+ }
checkpoint-10000/embodiment_id.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "robocasa_panda_omron": 13,
3
+ "gr1": 20,
4
+ "behavior_r1_pro": 24,
5
+ "unitree_g1": 8,
6
+ "oxe_google": 0,
7
+ "oxe_widowx": 1,
8
+ "libero_panda": 2,
9
+ "oxe_droid": 16,
10
+ "new_embodiment": 10
11
+ }
checkpoint-10000/experiment_cfg/conf.yaml ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ load_config_path: null
2
+ model:
3
+ model_type: Gr00tN1d6
4
+ model_dtype: bfloat16
5
+ model_name: nvidia/Eagle-Block2A-2B-v2
6
+ backbone_model_type: eagle
7
+ model_revision: null
8
+ tune_top_llm_layers: 4
9
+ backbone_embedding_dim: 2048
10
+ tune_llm: false
11
+ tune_visual: false
12
+ select_layer: 16
13
+ reproject_vision: false
14
+ use_flash_attention: true
15
+ load_bf16: false
16
+ collator_overwrite_image_inputs: false
17
+ eagle_collator: true
18
+ backbone_trainable_params_fp32: true
19
+ image_crop_size: null
20
+ image_target_size: null
21
+ shortest_image_edge: 256
22
+ crop_fraction: 0.95
23
+ random_rotation_angle: null
24
+ color_jitter_params: null
25
+ use_albumentations_transforms: true
26
+ extra_augmentation_config: null
27
+ formalize_language: true
28
+ apply_sincos_state_encoding: false
29
+ use_relative_action: true
30
+ max_state_dim: 29
31
+ max_action_dim: 29
32
+ action_horizon: 16
33
+ hidden_size: 1024
34
+ input_embedding_dim: 1536
35
+ add_pos_embed: true
36
+ attn_dropout: 0.2
37
+ use_vlln: true
38
+ max_seq_len: 1024
39
+ use_alternate_vl_dit: true
40
+ attend_text_every_n_blocks: 2
41
+ diffusion_model_cfg:
42
+ positional_embeddings: null
43
+ num_layers: 32
44
+ num_attention_heads: 32
45
+ attention_head_dim: 48
46
+ norm_type: ada_norm
47
+ dropout: 0.2
48
+ final_dropout: true
49
+ output_dim: 1024
50
+ interleave_self_attention: true
51
+ num_inference_timesteps: 4
52
+ noise_beta_alpha: 1.5
53
+ noise_beta_beta: 1.0
54
+ noise_s: 0.999
55
+ num_timestep_buckets: 1000
56
+ tune_projector: true
57
+ tune_diffusion_model: true
58
+ tune_vlln: true
59
+ state_dropout_prob: 0.0
60
+ state_additive_noise_scale: 0.0
61
+ max_num_embodiments: 32
62
+ data:
63
+ datasets:
64
+ - dataset_paths:
65
+ - ./cherry_data
66
+ embodiment_tag: new_embodiment
67
+ mix_ratio: 1.0
68
+ dataset_type: physical_embodiment
69
+ val_dataset_path: null
70
+ modality_configs:
71
+ new_embodiment:
72
+ video:
73
+ delta_indices:
74
+ - 0
75
+ modality_keys:
76
+ - cam_base
77
+ - cam_wrist
78
+ sin_cos_embedding_keys: null
79
+ mean_std_embedding_keys: null
80
+ action_configs: null
81
+ state:
82
+ delta_indices:
83
+ - 0
84
+ modality_keys:
85
+ - arm
86
+ - gripper
87
+ sin_cos_embedding_keys: null
88
+ mean_std_embedding_keys: null
89
+ action_configs: null
90
+ action:
91
+ delta_indices:
92
+ - 0
93
+ - 1
94
+ - 2
95
+ - 3
96
+ - 4
97
+ - 5
98
+ - 6
99
+ - 7
100
+ - 8
101
+ - 9
102
+ - 10
103
+ - 11
104
+ - 12
105
+ - 13
106
+ - 14
107
+ - 15
108
+ modality_keys:
109
+ - arm
110
+ - gripper
111
+ sin_cos_embedding_keys: null
112
+ mean_std_embedding_keys: null
113
+ action_configs:
114
+ - rep: ABSOLUTE
115
+ type: NON_EEF
116
+ format: DEFAULT
117
+ state_key: null
118
+ - rep: ABSOLUTE
119
+ type: NON_EEF
120
+ format: DEFAULT
121
+ state_key: null
122
+ language:
123
+ delta_indices:
124
+ - 0
125
+ modality_keys:
126
+ - annotation.human.task_description
127
+ sin_cos_embedding_keys: null
128
+ mean_std_embedding_keys: null
129
+ action_configs: null
130
+ download_cache: false
131
+ shard_size: 1024
132
+ episode_sampling_rate: 0.1
133
+ num_shards_per_epoch: 100000
134
+ override_pretraining_statistics: false
135
+ mode: single_turn
136
+ random_chop: 0.0
137
+ mock_dataset_mode: false
138
+ shuffle: true
139
+ seed: 42
140
+ multiprocessing_context: fork
141
+ allow_padding: false
142
+ subsample_ratio: 1.0
143
+ image_crop_size:
144
+ - 244
145
+ - 244
146
+ image_target_size:
147
+ - 224
148
+ - 224
149
+ video_backend: decord
150
+ training:
151
+ output_dir: ./outputs/fr5_cherry
152
+ experiment_name: null
153
+ max_steps: 20000
154
+ global_batch_size: 32
155
+ batch_size: null
156
+ gradient_accumulation_steps: 1
157
+ learning_rate: 0.0001
158
+ lr_scheduler_type: cosine
159
+ weight_decay: 1.0e-05
160
+ warmup_ratio: 0.05
161
+ warmup_steps: 0
162
+ max_grad_norm: 1.0
163
+ optim: adamw_torch
164
+ start_from_checkpoint: ./pretrained_models/GR00T-N1.6-3B
165
+ tf32: true
166
+ fp16: false
167
+ bf16: true
168
+ eval_bf16: true
169
+ logging_steps: 10
170
+ save_steps: 5000
171
+ save_total_limit: 5
172
+ save_vl_model: false
173
+ upload_checkpoints: false
174
+ upload_every: 1000
175
+ upload_last_n_checkpoints: 5
176
+ max_concurrent_uploads: 2
177
+ eval_strategy: 'no'
178
+ eval_steps: 500
179
+ eval_set_split_ratio: 0.1
180
+ eval_batch_size: 2
181
+ save_best_eval_metric_name: ''
182
+ save_best_eval_metric_greater_is_better: true
183
+ deepspeed_stage: 2
184
+ gradient_checkpointing: false
185
+ transformers_trust_remote_code: true
186
+ transformers_local_files_only: false
187
+ transformers_cache_dir: null
188
+ transformers_access_token: null
189
+ use_ddp: false
190
+ ddp_bucket_cap_mb: 100
191
+ num_gpus: 1
192
+ dataloader_num_workers: 2
193
+ remove_unused_columns: false
194
+ use_wandb: false
195
+ wandb_project: finetune-gr00t-n1d6
196
+ enable_profiling: false
197
+ max_retries: 3
198
+ assert_loss_less_than: null
199
+ add_rl_callback: false
200
+ enable_open_loop_eval: false
201
+ open_loop_eval_traj_ids:
202
+ - 0
203
+ open_loop_eval_steps_per_traj: 100
204
+ open_loop_eval_plot_indices: null
205
+ max_steps: 20000
206
+ save_steps: 5000
checkpoint-10000/experiment_cfg/config.yaml ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !!python/object:gr00t.configs.base_config.Config
2
+ data: !!python/object:gr00t.configs.data.data_config.DataConfig
3
+ allow_padding: false
4
+ datasets:
5
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
6
+ dataset_paths:
7
+ - ./cherry_data
8
+ dataset_type: physical_embodiment
9
+ embodiment_tag: new_embodiment
10
+ mix_ratio: 1.0
11
+ val_dataset_path: null
12
+ download_cache: false
13
+ episode_sampling_rate: 0.1
14
+ image_crop_size:
15
+ - 244
16
+ - 244
17
+ image_target_size:
18
+ - 224
19
+ - 224
20
+ mock_dataset_mode: false
21
+ modality_configs:
22
+ new_embodiment:
23
+ action: !!python/object:gr00t.data.types.ModalityConfig
24
+ action_configs:
25
+ - !!python/object:gr00t.data.types.ActionConfig
26
+ format: &id001 !!python/object/apply:gr00t.data.types.ActionFormat
27
+ - default
28
+ rep: &id002 !!python/object/apply:gr00t.data.types.ActionRepresentation
29
+ - absolute
30
+ state_key: null
31
+ type: &id003 !!python/object/apply:gr00t.data.types.ActionType
32
+ - non_eef
33
+ - !!python/object:gr00t.data.types.ActionConfig
34
+ format: *id001
35
+ rep: *id002
36
+ state_key: null
37
+ type: *id003
38
+ delta_indices:
39
+ - 0
40
+ - 1
41
+ - 2
42
+ - 3
43
+ - 4
44
+ - 5
45
+ - 6
46
+ - 7
47
+ - 8
48
+ - 9
49
+ - 10
50
+ - 11
51
+ - 12
52
+ - 13
53
+ - 14
54
+ - 15
55
+ mean_std_embedding_keys: null
56
+ modality_keys:
57
+ - arm
58
+ - gripper
59
+ sin_cos_embedding_keys: null
60
+ language: !!python/object:gr00t.data.types.ModalityConfig
61
+ action_configs: null
62
+ delta_indices:
63
+ - 0
64
+ mean_std_embedding_keys: null
65
+ modality_keys:
66
+ - annotation.human.task_description
67
+ sin_cos_embedding_keys: null
68
+ state: !!python/object:gr00t.data.types.ModalityConfig
69
+ action_configs: null
70
+ delta_indices:
71
+ - 0
72
+ mean_std_embedding_keys: null
73
+ modality_keys:
74
+ - arm
75
+ - gripper
76
+ sin_cos_embedding_keys: null
77
+ video: !!python/object:gr00t.data.types.ModalityConfig
78
+ action_configs: null
79
+ delta_indices:
80
+ - 0
81
+ mean_std_embedding_keys: null
82
+ modality_keys:
83
+ - cam_base
84
+ - cam_wrist
85
+ sin_cos_embedding_keys: null
86
+ mode: single_turn
87
+ multiprocessing_context: fork
88
+ num_shards_per_epoch: 100000
89
+ override_pretraining_statistics: false
90
+ random_chop: 0.0
91
+ seed: 42
92
+ shard_size: 1024
93
+ shuffle: true
94
+ subsample_ratio: 1.0
95
+ video_backend: decord
96
+ load_config_path: null
97
+ model: !!python/object:gr00t.configs.model.gr00t_n1d6.Gr00tN1d6Config
98
+ _attn_implementation_autoset: false
99
+ _attn_implementation_internal: null
100
+ _commit_hash: null
101
+ _name_or_path: ''
102
+ add_cross_attention: false
103
+ architectures: null
104
+ backbone_model_type: eagle
105
+ backbone_trainable_params_fp32: true
106
+ bad_words_ids: null
107
+ begin_suppress_tokens: null
108
+ bos_token_id: null
109
+ chunk_size_feed_forward: 0
110
+ color_jitter_params: null
111
+ cross_attention_hidden_size: null
112
+ decoder_start_token_id: null
113
+ diffusion_model_cfg:
114
+ attention_head_dim: 48
115
+ dropout: 0.2
116
+ final_dropout: true
117
+ interleave_self_attention: true
118
+ norm_type: ada_norm
119
+ num_attention_heads: 32
120
+ num_layers: 32
121
+ output_dim: 1024
122
+ positional_embeddings: null
123
+ diversity_penalty: 0.0
124
+ do_sample: false
125
+ eagle_collator: true
126
+ early_stopping: false
127
+ encoder_no_repeat_ngram_size: 0
128
+ eos_token_id: null
129
+ exponential_decay_length_penalty: null
130
+ extra_augmentation_config: null
131
+ finetuning_task: null
132
+ forced_bos_token_id: null
133
+ forced_eos_token_id: null
134
+ id2label:
135
+ 0: LABEL_0
136
+ 1: LABEL_1
137
+ is_decoder: false
138
+ is_encoder_decoder: false
139
+ label2id:
140
+ LABEL_0: 0
141
+ LABEL_1: 1
142
+ length_penalty: 1.0
143
+ load_bf16: false
144
+ max_length: 20
145
+ min_length: 0
146
+ model_name: nvidia/Eagle-Block2A-2B-v2
147
+ no_repeat_ngram_size: 0
148
+ num_beam_groups: 1
149
+ num_beams: 1
150
+ num_return_sequences: 1
151
+ output_attentions: false
152
+ output_hidden_states: false
153
+ output_scores: false
154
+ pad_token_id: null
155
+ prefix: null
156
+ problem_type: null
157
+ pruned_heads: {}
158
+ random_rotation_angle: null
159
+ remove_invalid_values: false
160
+ repetition_penalty: 1.0
161
+ reproject_vision: false
162
+ return_dict: true
163
+ return_dict_in_generate: false
164
+ sep_token_id: null
165
+ state_dropout_prob: 0.0
166
+ suppress_tokens: null
167
+ task_specific_params: null
168
+ temperature: 1.0
169
+ tf_legacy_loss: false
170
+ tie_encoder_decoder: false
171
+ tie_word_embeddings: true
172
+ tokenizer_class: null
173
+ top_k: 50
174
+ top_p: 1.0
175
+ torch_dtype: null
176
+ torchscript: false
177
+ transformers_version: null
178
+ tune_diffusion_model: true
179
+ tune_llm: false
180
+ tune_projector: true
181
+ tune_visual: false
182
+ typical_p: 1.0
183
+ use_bfloat16: false
184
+ use_relative_action: true
185
+ training: !!python/object:gr00t.configs.training.training_config.TrainingConfig
186
+ add_rl_callback: false
187
+ assert_loss_less_than: null
188
+ batch_size: null
189
+ bf16: true
190
+ dataloader_num_workers: 2
191
+ ddp_bucket_cap_mb: 100
192
+ deepspeed_stage: 2
193
+ enable_open_loop_eval: false
194
+ enable_profiling: false
195
+ eval_batch_size: 2
196
+ eval_bf16: true
197
+ eval_set_split_ratio: 0.1
198
+ eval_steps: 500
199
+ eval_strategy: 'no'
200
+ experiment_name: null
201
+ fp16: false
202
+ global_batch_size: 32
203
+ gradient_accumulation_steps: 1
204
+ gradient_checkpointing: false
205
+ learning_rate: 0.0001
206
+ logging_steps: 10
207
+ lr_scheduler_type: cosine
208
+ max_concurrent_uploads: 2
209
+ max_grad_norm: 1.0
210
+ max_retries: 3
211
+ max_steps: 20000
212
+ num_gpus: 1
213
+ open_loop_eval_plot_indices: null
214
+ open_loop_eval_steps_per_traj: 100
215
+ open_loop_eval_traj_ids:
216
+ - 0
217
+ optim: adamw_torch
218
+ output_dir: ./outputs/fr5_cherry
219
+ remove_unused_columns: false
220
+ save_best_eval_metric_greater_is_better: true
221
+ save_best_eval_metric_name: ''
222
+ save_steps: 5000
223
+ save_total_limit: 5
224
+ save_vl_model: false
225
+ start_from_checkpoint: ./pretrained_models/GR00T-N1.6-3B
226
+ tf32: true
227
+ transformers_access_token: null
228
+ transformers_cache_dir: null
229
+ transformers_local_files_only: false
230
+ transformers_trust_remote_code: true
231
+ upload_checkpoints: false
232
+ upload_every: 1000
233
+ upload_last_n_checkpoints: 5
234
+ use_ddp: false
235
+ use_wandb: false
236
+ wandb_project: finetune-gr00t-n1d6
237
+ warmup_ratio: 0.05
238
+ warmup_steps: 0
239
+ weight_decay: 1.0e-05
checkpoint-10000/experiment_cfg/dataset_statistics.json ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "new_embodiment": {
3
+ "state": {
4
+ "arm": {
5
+ "min": [
6
+ 0.4818978011608124,
7
+ -1.687173843383789,
8
+ 0.62826007604599,
9
+ -2.6761701107025146,
10
+ -1.8431425094604492,
11
+ -0.5678880214691162
12
+ ],
13
+ "max": [
14
+ 0.7935351133346558,
15
+ -1.014952301979065,
16
+ 1.8637524843215942,
17
+ -1.0820374488830566,
18
+ -1.4455490112304688,
19
+ 0.3115537762641907
20
+ ],
21
+ "mean": [
22
+ 0.6489784717559814,
23
+ -1.3269319534301758,
24
+ 1.356391429901123,
25
+ -1.804563045501709,
26
+ -1.619696021080017,
27
+ -0.07974076271057129
28
+ ],
29
+ "std": [
30
+ 0.053538445383310186,
31
+ 0.1604488044977188,
32
+ 0.2438623011112213,
33
+ 0.22075510025024414,
34
+ 0.07333532720804177,
35
+ 0.10092151165008545
36
+ ],
37
+ "q01": [
38
+ 0.5197953635454178,
39
+ -1.6432996988296509,
40
+ 0.8626433879137039,
41
+ -2.5542680168151857,
42
+ -1.8000394713878631,
43
+ -0.37301090329885483
44
+ ],
45
+ "q99": [
46
+ 0.7509180748462676,
47
+ -1.0879072868824005,
48
+ 1.7959050333499906,
49
+ -1.252977850437165,
50
+ -1.4705305182933812,
51
+ 0.2933953133225437
52
+ ]
53
+ },
54
+ "gripper": {
55
+ "min": [
56
+ 0.0
57
+ ],
58
+ "max": [
59
+ 1.0
60
+ ],
61
+ "mean": [
62
+ 0.7650123238563538
63
+ ],
64
+ "std": [
65
+ 0.39907386898994446
66
+ ],
67
+ "q01": [
68
+ 0.0
69
+ ],
70
+ "q99": [
71
+ 1.0
72
+ ]
73
+ }
74
+ },
75
+ "action": {
76
+ "arm": {
77
+ "min": [
78
+ 0.4818978011608124,
79
+ -1.687173843383789,
80
+ 0.62826007604599,
81
+ -2.6573522090911865,
82
+ -1.8431425094604492,
83
+ -0.5678880214691162
84
+ ],
85
+ "max": [
86
+ 0.7935351133346558,
87
+ -1.014952301979065,
88
+ 1.8637524843215942,
89
+ -1.0820374488830566,
90
+ -1.4455490112304688,
91
+ 0.3115537762641907
92
+ ],
93
+ "mean": [
94
+ 0.6489997506141663,
95
+ -1.326717495918274,
96
+ 1.355955958366394,
97
+ -1.8026670217514038,
98
+ -1.6199865341186523,
99
+ -0.07982920855283737
100
+ ],
101
+ "std": [
102
+ 0.05358240380883204,
103
+ 0.16021256148815155,
104
+ 0.243374302983284,
105
+ 0.2178075611591339,
106
+ 0.07321629673242531,
107
+ 0.10097639262676239
108
+ ],
109
+ "q01": [
110
+ 0.5197953635454178,
111
+ -1.6432996988296509,
112
+ 0.8625765931606293,
113
+ -2.53433034658432,
114
+ -1.8000823378562927,
115
+ -0.37301090329885483
116
+ ],
117
+ "q99": [
118
+ 0.7509180748462676,
119
+ -1.0879072868824005,
120
+ 1.7849992513656616,
121
+ -1.2526323044300085,
122
+ -1.4705633461475374,
123
+ 0.2933953133225437
124
+ ]
125
+ },
126
+ "gripper": {
127
+ "min": [
128
+ 0.0
129
+ ],
130
+ "max": [
131
+ 1.0
132
+ ],
133
+ "mean": [
134
+ 0.7650123238563538
135
+ ],
136
+ "std": [
137
+ 0.39907386898994446
138
+ ],
139
+ "q01": [
140
+ 0.0
141
+ ],
142
+ "q99": [
143
+ 1.0
144
+ ]
145
+ }
146
+ },
147
+ "relative_action": {}
148
+ }
149
+ }
checkpoint-10000/experiment_cfg/final_model_config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "Gr00tN1d6",
3
+ "model_dtype": "bfloat16",
4
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
5
+ "backbone_model_type": "eagle",
6
+ "model_revision": null,
7
+ "tune_top_llm_layers": 4,
8
+ "backbone_embedding_dim": 2048,
9
+ "tune_llm": false,
10
+ "tune_visual": false,
11
+ "select_layer": 16,
12
+ "reproject_vision": false,
13
+ "use_flash_attention": true,
14
+ "load_bf16": true,
15
+ "collator_overwrite_image_inputs": false,
16
+ "eagle_collator": true,
17
+ "backbone_trainable_params_fp32": true,
18
+ "extra_augmentation_config": null,
19
+ "apply_sincos_state_encoding": true,
20
+ "use_relative_action": true,
21
+ "max_state_dim": 128,
22
+ "max_action_dim": 128,
23
+ "action_horizon": 50,
24
+ "hidden_size": 1024,
25
+ "input_embedding_dim": 1536,
26
+ "add_pos_embed": true,
27
+ "attn_dropout": 0.2,
28
+ "use_vlln": true,
29
+ "max_seq_len": 1024,
30
+ "use_alternate_vl_dit": true,
31
+ "attend_text_every_n_blocks": 2,
32
+ "diffusion_model_cfg": {
33
+ "attention_head_dim": 48,
34
+ "dropout": 0.2,
35
+ "final_dropout": true,
36
+ "interleave_self_attention": true,
37
+ "norm_type": "ada_norm",
38
+ "num_attention_heads": 32,
39
+ "num_layers": 32,
40
+ "output_dim": 1024,
41
+ "positional_embeddings": null
42
+ },
43
+ "num_inference_timesteps": 4,
44
+ "noise_beta_alpha": 1.5,
45
+ "noise_beta_beta": 1.0,
46
+ "noise_s": 0.999,
47
+ "num_timestep_buckets": 1000,
48
+ "tune_projector": true,
49
+ "tune_diffusion_model": true,
50
+ "tune_vlln": true,
51
+ "state_dropout_prob": 0.0,
52
+ "state_additive_noise_scale": 0.0,
53
+ "max_num_embodiments": 32
54
+ }
checkpoint-10000/experiment_cfg/final_processor_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10000/model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a22551925e5bb41c48ebb3cd8533566607a4e966f51d34097f53fbe937a91659
3
+ size 4990120184
checkpoint-10000/model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01a8b9f9243a6606b220e9ecfdf9c90271caecbf6a218d878817c602c0be7a3e
3
+ size 4823190320
checkpoint-10000/model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1daa28f457b70c4bbcb6ef25875f17b75f9c587aee1da4f54bc2f8a7177d9af
3
+ size 12960193762
checkpoint-10000/processor_config.json ADDED
@@ -0,0 +1,455 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "Gr00tN1d6Processor",
3
+ "processor_kwargs": {
4
+ "modality_configs": {
5
+ "behavior_r1_pro": {
6
+ "video": {
7
+ "delta_indices": [
8
+ 0
9
+ ],
10
+ "modality_keys": [
11
+ "observation.images.rgb.head_256_256",
12
+ "observation.images.rgb.left_wrist_256_256",
13
+ "observation.images.rgb.right_wrist_256_256"
14
+ ],
15
+ "sin_cos_embedding_keys": null,
16
+ "mean_std_embedding_keys": null,
17
+ "action_configs": null
18
+ },
19
+ "state": {
20
+ "delta_indices": [
21
+ 0
22
+ ],
23
+ "modality_keys": [
24
+ "robot_pos",
25
+ "robot_ori_cos",
26
+ "robot_ori_sin",
27
+ "robot_2d_ori",
28
+ "robot_2d_ori_cos",
29
+ "robot_2d_ori_sin",
30
+ "robot_lin_vel",
31
+ "robot_ang_vel",
32
+ "arm_left_qpos",
33
+ "arm_left_qpos_sin",
34
+ "arm_left_qpos_cos",
35
+ "eef_left_pos",
36
+ "eef_left_quat",
37
+ "gripper_left_qpos",
38
+ "arm_right_qpos",
39
+ "arm_right_qpos_sin",
40
+ "arm_right_qpos_cos",
41
+ "eef_right_pos",
42
+ "eef_right_quat",
43
+ "gripper_right_qpos",
44
+ "trunk_qpos"
45
+ ],
46
+ "sin_cos_embedding_keys": null,
47
+ "mean_std_embedding_keys": null,
48
+ "action_configs": null
49
+ },
50
+ "action": {
51
+ "delta_indices": [
52
+ 0,
53
+ 1,
54
+ 2,
55
+ 3,
56
+ 4,
57
+ 5,
58
+ 6,
59
+ 7,
60
+ 8,
61
+ 9,
62
+ 10,
63
+ 11,
64
+ 12,
65
+ 13,
66
+ 14,
67
+ 15,
68
+ 16,
69
+ 17,
70
+ 18,
71
+ 19,
72
+ 20,
73
+ 21,
74
+ 22,
75
+ 23,
76
+ 24,
77
+ 25,
78
+ 26,
79
+ 27,
80
+ 28,
81
+ 29,
82
+ 30,
83
+ 31
84
+ ],
85
+ "modality_keys": [
86
+ "base",
87
+ "torso",
88
+ "left_arm",
89
+ "left_gripper",
90
+ "right_arm",
91
+ "right_gripper"
92
+ ],
93
+ "sin_cos_embedding_keys": null,
94
+ "mean_std_embedding_keys": null,
95
+ "action_configs": [
96
+ {
97
+ "rep": "ABSOLUTE",
98
+ "type": "NON_EEF",
99
+ "format": "DEFAULT",
100
+ "state_key": null
101
+ },
102
+ {
103
+ "rep": "RELATIVE",
104
+ "type": "NON_EEF",
105
+ "format": "DEFAULT",
106
+ "state_key": "trunk_qpos"
107
+ },
108
+ {
109
+ "rep": "RELATIVE",
110
+ "type": "NON_EEF",
111
+ "format": "DEFAULT",
112
+ "state_key": "arm_left_qpos"
113
+ },
114
+ {
115
+ "rep": "ABSOLUTE",
116
+ "type": "NON_EEF",
117
+ "format": "DEFAULT",
118
+ "state_key": null
119
+ },
120
+ {
121
+ "rep": "RELATIVE",
122
+ "type": "NON_EEF",
123
+ "format": "DEFAULT",
124
+ "state_key": "arm_right_qpos"
125
+ },
126
+ {
127
+ "rep": "ABSOLUTE",
128
+ "type": "NON_EEF",
129
+ "format": "DEFAULT",
130
+ "state_key": null
131
+ }
132
+ ]
133
+ },
134
+ "language": {
135
+ "delta_indices": [
136
+ 0
137
+ ],
138
+ "modality_keys": [
139
+ "annotation.human.coarse_action"
140
+ ],
141
+ "sin_cos_embedding_keys": null,
142
+ "mean_std_embedding_keys": null,
143
+ "action_configs": null
144
+ }
145
+ },
146
+ "gr1": {
147
+ "video": {
148
+ "delta_indices": [
149
+ 0
150
+ ],
151
+ "modality_keys": [
152
+ "ego_view_bg_crop_pad_res256_freq20"
153
+ ],
154
+ "sin_cos_embedding_keys": null,
155
+ "mean_std_embedding_keys": null,
156
+ "action_configs": null
157
+ },
158
+ "state": {
159
+ "delta_indices": [
160
+ 0
161
+ ],
162
+ "modality_keys": [
163
+ "left_arm",
164
+ "right_arm",
165
+ "left_hand",
166
+ "right_hand",
167
+ "waist"
168
+ ],
169
+ "sin_cos_embedding_keys": [
170
+ "left_arm",
171
+ "right_arm",
172
+ "left_hand",
173
+ "right_hand",
174
+ "waist"
175
+ ],
176
+ "mean_std_embedding_keys": null,
177
+ "action_configs": null
178
+ },
179
+ "action": {
180
+ "delta_indices": [
181
+ 0,
182
+ 1,
183
+ 2,
184
+ 3,
185
+ 4,
186
+ 5,
187
+ 6,
188
+ 7,
189
+ 8,
190
+ 9,
191
+ 10,
192
+ 11,
193
+ 12,
194
+ 13,
195
+ 14,
196
+ 15
197
+ ],
198
+ "modality_keys": [
199
+ "left_arm",
200
+ "right_arm",
201
+ "left_hand",
202
+ "right_hand",
203
+ "waist"
204
+ ],
205
+ "sin_cos_embedding_keys": null,
206
+ "mean_std_embedding_keys": null,
207
+ "action_configs": [
208
+ {
209
+ "rep": "RELATIVE",
210
+ "type": "NON_EEF",
211
+ "format": "DEFAULT",
212
+ "state_key": null
213
+ },
214
+ {
215
+ "rep": "RELATIVE",
216
+ "type": "NON_EEF",
217
+ "format": "DEFAULT",
218
+ "state_key": null
219
+ },
220
+ {
221
+ "rep": "RELATIVE",
222
+ "type": "NON_EEF",
223
+ "format": "DEFAULT",
224
+ "state_key": null
225
+ },
226
+ {
227
+ "rep": "RELATIVE",
228
+ "type": "NON_EEF",
229
+ "format": "DEFAULT",
230
+ "state_key": null
231
+ },
232
+ {
233
+ "rep": "ABSOLUTE",
234
+ "type": "NON_EEF",
235
+ "format": "DEFAULT",
236
+ "state_key": null
237
+ }
238
+ ]
239
+ },
240
+ "language": {
241
+ "delta_indices": [
242
+ 0
243
+ ],
244
+ "modality_keys": [
245
+ "task"
246
+ ],
247
+ "sin_cos_embedding_keys": null,
248
+ "mean_std_embedding_keys": null,
249
+ "action_configs": null
250
+ }
251
+ },
252
+ "robocasa_panda_omron": {
253
+ "video": {
254
+ "delta_indices": [
255
+ 0
256
+ ],
257
+ "modality_keys": [
258
+ "res256_image_side_0",
259
+ "res256_image_side_1",
260
+ "res256_image_wrist_0"
261
+ ],
262
+ "sin_cos_embedding_keys": null,
263
+ "mean_std_embedding_keys": null,
264
+ "action_configs": null
265
+ },
266
+ "state": {
267
+ "delta_indices": [
268
+ 0
269
+ ],
270
+ "modality_keys": [
271
+ "end_effector_position_relative",
272
+ "end_effector_rotation_relative",
273
+ "gripper_qpos",
274
+ "base_position",
275
+ "base_rotation"
276
+ ],
277
+ "sin_cos_embedding_keys": null,
278
+ "mean_std_embedding_keys": null,
279
+ "action_configs": null
280
+ },
281
+ "action": {
282
+ "delta_indices": [
283
+ 0,
284
+ 1,
285
+ 2,
286
+ 3,
287
+ 4,
288
+ 5,
289
+ 6,
290
+ 7,
291
+ 8,
292
+ 9,
293
+ 10,
294
+ 11,
295
+ 12,
296
+ 13,
297
+ 14,
298
+ 15
299
+ ],
300
+ "modality_keys": [
301
+ "end_effector_position",
302
+ "end_effector_rotation",
303
+ "gripper_close",
304
+ "base_motion",
305
+ "control_mode"
306
+ ],
307
+ "sin_cos_embedding_keys": null,
308
+ "mean_std_embedding_keys": null,
309
+ "action_configs": [
310
+ {
311
+ "rep": "ABSOLUTE",
312
+ "type": "NON_EEF",
313
+ "format": "DEFAULT",
314
+ "state_key": null
315
+ },
316
+ {
317
+ "rep": "ABSOLUTE",
318
+ "type": "NON_EEF",
319
+ "format": "DEFAULT",
320
+ "state_key": null
321
+ },
322
+ {
323
+ "rep": "ABSOLUTE",
324
+ "type": "NON_EEF",
325
+ "format": "DEFAULT",
326
+ "state_key": null
327
+ },
328
+ {
329
+ "rep": "ABSOLUTE",
330
+ "type": "NON_EEF",
331
+ "format": "DEFAULT",
332
+ "state_key": null
333
+ },
334
+ {
335
+ "rep": "ABSOLUTE",
336
+ "type": "NON_EEF",
337
+ "format": "DEFAULT",
338
+ "state_key": null
339
+ }
340
+ ]
341
+ },
342
+ "language": {
343
+ "delta_indices": [
344
+ 0
345
+ ],
346
+ "modality_keys": [
347
+ "annotation.human.action.task_description"
348
+ ],
349
+ "sin_cos_embedding_keys": null,
350
+ "mean_std_embedding_keys": null,
351
+ "action_configs": null
352
+ }
353
+ },
354
+ "new_embodiment": {
355
+ "video": {
356
+ "delta_indices": [
357
+ 0
358
+ ],
359
+ "modality_keys": [
360
+ "cam_base",
361
+ "cam_wrist"
362
+ ],
363
+ "sin_cos_embedding_keys": null,
364
+ "mean_std_embedding_keys": null,
365
+ "action_configs": null
366
+ },
367
+ "state": {
368
+ "delta_indices": [
369
+ 0
370
+ ],
371
+ "modality_keys": [
372
+ "arm",
373
+ "gripper"
374
+ ],
375
+ "sin_cos_embedding_keys": null,
376
+ "mean_std_embedding_keys": null,
377
+ "action_configs": null
378
+ },
379
+ "action": {
380
+ "delta_indices": [
381
+ 0,
382
+ 1,
383
+ 2,
384
+ 3,
385
+ 4,
386
+ 5,
387
+ 6,
388
+ 7,
389
+ 8,
390
+ 9,
391
+ 10,
392
+ 11,
393
+ 12,
394
+ 13,
395
+ 14,
396
+ 15
397
+ ],
398
+ "modality_keys": [
399
+ "arm",
400
+ "gripper"
401
+ ],
402
+ "sin_cos_embedding_keys": null,
403
+ "mean_std_embedding_keys": null,
404
+ "action_configs": [
405
+ {
406
+ "rep": "ABSOLUTE",
407
+ "type": "NON_EEF",
408
+ "format": "DEFAULT",
409
+ "state_key": null
410
+ },
411
+ {
412
+ "rep": "ABSOLUTE",
413
+ "type": "NON_EEF",
414
+ "format": "DEFAULT",
415
+ "state_key": null
416
+ }
417
+ ]
418
+ },
419
+ "language": {
420
+ "delta_indices": [
421
+ 0
422
+ ],
423
+ "modality_keys": [
424
+ "annotation.human.task_description"
425
+ ],
426
+ "sin_cos_embedding_keys": null,
427
+ "mean_std_embedding_keys": null,
428
+ "action_configs": null
429
+ }
430
+ }
431
+ },
432
+ "image_crop_size": null,
433
+ "image_target_size": null,
434
+ "use_albumentations": true,
435
+ "random_rotation_angle": null,
436
+ "color_jitter_params": {
437
+ "brightness": 0.3,
438
+ "contrast": 0.4,
439
+ "saturation": 0.5,
440
+ "hue": 0.08
441
+ },
442
+ "shortest_image_edge": 256,
443
+ "crop_fraction": 0.95,
444
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
445
+ "model_type": "eagle",
446
+ "formalize_language": true,
447
+ "max_state_dim": 128,
448
+ "max_action_dim": 128,
449
+ "max_action_horizon": 50,
450
+ "use_percentiles": false,
451
+ "clip_outliers": true,
452
+ "apply_sincos_state_encoding": true,
453
+ "use_relative_action": true
454
+ }
455
+ }
checkpoint-10000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f5ea38e8fe73fc84868ec3e6011e3571a59e9595fe7e6de70a21b520b40798f
3
+ size 14645
checkpoint-10000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9793ce798c508501a83c7de96591494637b8ee20bbad166324a3caeddb4cdc8e
3
+ size 1465
checkpoint-10000/statistics.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49cf1bb9160fba423ccdd2dae7d9b40228772dd15a3fb17d44a4fd07f7818f45
3
+ size 5713
checkpoint-10000/wandb_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"project": "finetune-gr00t-n1d6", "run_id": "fr5_cherry"}
checkpoint-15000/config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_horizon": 50,
3
+ "add_pos_embed": true,
4
+ "apply_sincos_state_encoding": true,
5
+ "architectures": [
6
+ "Gr00tN1d6"
7
+ ],
8
+ "attn_dropout": 0.2,
9
+ "attn_implementation": null,
10
+ "backbone_embedding_dim": 2048,
11
+ "backbone_model_type": "eagle",
12
+ "backbone_trainable_params_fp32": true,
13
+ "collator_overwrite_image_inputs": false,
14
+ "color_jitter_params": {
15
+ "brightness": 0.1,
16
+ "contrast": 0.1,
17
+ "hue": 0.1,
18
+ "saturation": 0.1
19
+ },
20
+ "crop_fraction": 0.95,
21
+ "diffusion_model_cfg": {
22
+ "attention_head_dim": 48,
23
+ "dropout": 0.2,
24
+ "final_dropout": true,
25
+ "interleave_self_attention": true,
26
+ "norm_type": "ada_norm",
27
+ "num_attention_heads": 32,
28
+ "num_layers": 32,
29
+ "output_dim": 1024,
30
+ "positional_embeddings": null
31
+ },
32
+ "eagle_collator": true,
33
+ "formalize_language": true,
34
+ "gemma_collator": false,
35
+ "hidden_size": 1024,
36
+ "image_crop_size": null,
37
+ "image_target_size": null,
38
+ "input_embedding_dim": 1536,
39
+ "load_bf16": true,
40
+ "max_action_dim": 128,
41
+ "max_num_embodiments": 32,
42
+ "max_seq_len": 1024,
43
+ "max_state_dim": 128,
44
+ "model_dtype": "bfloat16",
45
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
46
+ "model_type": "Gr00tN1d6",
47
+ "noise_beta_alpha": 1.5,
48
+ "noise_beta_beta": 1.0,
49
+ "noise_s": 0.999,
50
+ "num_inference_timesteps": 4,
51
+ "num_timestep_buckets": 1000,
52
+ "random_rotation_angle": null,
53
+ "reproject_vision": false,
54
+ "select_layer": 16,
55
+ "shortest_image_edge": 256,
56
+ "state_dropout_prob": 0.0,
57
+ "torch_dtype": "bfloat16",
58
+ "transformers_version": "4.51.3",
59
+ "tune_diffusion_model": true,
60
+ "tune_llm": false,
61
+ "tune_projector": true,
62
+ "tune_top_llm_layers": 4,
63
+ "tune_visual": false,
64
+ "tune_vlln": true,
65
+ "use_albumentations_transforms": true,
66
+ "use_alternate_vl_dit": true,
67
+ "use_flash_attention": true,
68
+ "use_relative_action": true,
69
+ "use_vlln": true
70
+ }
checkpoint-15000/embodiment_id.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "robocasa_panda_omron": 13,
3
+ "gr1": 20,
4
+ "behavior_r1_pro": 24,
5
+ "unitree_g1": 8,
6
+ "oxe_google": 0,
7
+ "oxe_widowx": 1,
8
+ "libero_panda": 2,
9
+ "oxe_droid": 16,
10
+ "new_embodiment": 10
11
+ }
checkpoint-15000/experiment_cfg/conf.yaml ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ load_config_path: null
2
+ model:
3
+ model_type: Gr00tN1d6
4
+ model_dtype: bfloat16
5
+ model_name: nvidia/Eagle-Block2A-2B-v2
6
+ backbone_model_type: eagle
7
+ model_revision: null
8
+ tune_top_llm_layers: 4
9
+ backbone_embedding_dim: 2048
10
+ tune_llm: false
11
+ tune_visual: false
12
+ select_layer: 16
13
+ reproject_vision: false
14
+ use_flash_attention: true
15
+ load_bf16: false
16
+ collator_overwrite_image_inputs: false
17
+ eagle_collator: true
18
+ backbone_trainable_params_fp32: true
19
+ image_crop_size: null
20
+ image_target_size: null
21
+ shortest_image_edge: 256
22
+ crop_fraction: 0.95
23
+ random_rotation_angle: null
24
+ color_jitter_params: null
25
+ use_albumentations_transforms: true
26
+ extra_augmentation_config: null
27
+ formalize_language: true
28
+ apply_sincos_state_encoding: false
29
+ use_relative_action: true
30
+ max_state_dim: 29
31
+ max_action_dim: 29
32
+ action_horizon: 16
33
+ hidden_size: 1024
34
+ input_embedding_dim: 1536
35
+ add_pos_embed: true
36
+ attn_dropout: 0.2
37
+ use_vlln: true
38
+ max_seq_len: 1024
39
+ use_alternate_vl_dit: true
40
+ attend_text_every_n_blocks: 2
41
+ diffusion_model_cfg:
42
+ positional_embeddings: null
43
+ num_layers: 32
44
+ num_attention_heads: 32
45
+ attention_head_dim: 48
46
+ norm_type: ada_norm
47
+ dropout: 0.2
48
+ final_dropout: true
49
+ output_dim: 1024
50
+ interleave_self_attention: true
51
+ num_inference_timesteps: 4
52
+ noise_beta_alpha: 1.5
53
+ noise_beta_beta: 1.0
54
+ noise_s: 0.999
55
+ num_timestep_buckets: 1000
56
+ tune_projector: true
57
+ tune_diffusion_model: true
58
+ tune_vlln: true
59
+ state_dropout_prob: 0.0
60
+ state_additive_noise_scale: 0.0
61
+ max_num_embodiments: 32
62
+ data:
63
+ datasets:
64
+ - dataset_paths:
65
+ - ./cherry_data
66
+ embodiment_tag: new_embodiment
67
+ mix_ratio: 1.0
68
+ dataset_type: physical_embodiment
69
+ val_dataset_path: null
70
+ modality_configs:
71
+ new_embodiment:
72
+ video:
73
+ delta_indices:
74
+ - 0
75
+ modality_keys:
76
+ - cam_base
77
+ - cam_wrist
78
+ sin_cos_embedding_keys: null
79
+ mean_std_embedding_keys: null
80
+ action_configs: null
81
+ state:
82
+ delta_indices:
83
+ - 0
84
+ modality_keys:
85
+ - arm
86
+ - gripper
87
+ sin_cos_embedding_keys: null
88
+ mean_std_embedding_keys: null
89
+ action_configs: null
90
+ action:
91
+ delta_indices:
92
+ - 0
93
+ - 1
94
+ - 2
95
+ - 3
96
+ - 4
97
+ - 5
98
+ - 6
99
+ - 7
100
+ - 8
101
+ - 9
102
+ - 10
103
+ - 11
104
+ - 12
105
+ - 13
106
+ - 14
107
+ - 15
108
+ modality_keys:
109
+ - arm
110
+ - gripper
111
+ sin_cos_embedding_keys: null
112
+ mean_std_embedding_keys: null
113
+ action_configs:
114
+ - rep: ABSOLUTE
115
+ type: NON_EEF
116
+ format: DEFAULT
117
+ state_key: null
118
+ - rep: ABSOLUTE
119
+ type: NON_EEF
120
+ format: DEFAULT
121
+ state_key: null
122
+ language:
123
+ delta_indices:
124
+ - 0
125
+ modality_keys:
126
+ - annotation.human.task_description
127
+ sin_cos_embedding_keys: null
128
+ mean_std_embedding_keys: null
129
+ action_configs: null
130
+ download_cache: false
131
+ shard_size: 1024
132
+ episode_sampling_rate: 0.1
133
+ num_shards_per_epoch: 100000
134
+ override_pretraining_statistics: false
135
+ mode: single_turn
136
+ random_chop: 0.0
137
+ mock_dataset_mode: false
138
+ shuffle: true
139
+ seed: 42
140
+ multiprocessing_context: fork
141
+ allow_padding: false
142
+ subsample_ratio: 1.0
143
+ image_crop_size:
144
+ - 244
145
+ - 244
146
+ image_target_size:
147
+ - 224
148
+ - 224
149
+ video_backend: decord
150
+ training:
151
+ output_dir: ./outputs/fr5_cherry
152
+ experiment_name: null
153
+ max_steps: 20000
154
+ global_batch_size: 32
155
+ batch_size: null
156
+ gradient_accumulation_steps: 1
157
+ learning_rate: 0.0001
158
+ lr_scheduler_type: cosine
159
+ weight_decay: 1.0e-05
160
+ warmup_ratio: 0.05
161
+ warmup_steps: 0
162
+ max_grad_norm: 1.0
163
+ optim: adamw_torch
164
+ start_from_checkpoint: ./pretrained_models/GR00T-N1.6-3B
165
+ tf32: true
166
+ fp16: false
167
+ bf16: true
168
+ eval_bf16: true
169
+ logging_steps: 10
170
+ save_steps: 5000
171
+ save_total_limit: 5
172
+ save_vl_model: false
173
+ upload_checkpoints: false
174
+ upload_every: 1000
175
+ upload_last_n_checkpoints: 5
176
+ max_concurrent_uploads: 2
177
+ eval_strategy: 'no'
178
+ eval_steps: 500
179
+ eval_set_split_ratio: 0.1
180
+ eval_batch_size: 2
181
+ save_best_eval_metric_name: ''
182
+ save_best_eval_metric_greater_is_better: true
183
+ deepspeed_stage: 2
184
+ gradient_checkpointing: false
185
+ transformers_trust_remote_code: true
186
+ transformers_local_files_only: false
187
+ transformers_cache_dir: null
188
+ transformers_access_token: null
189
+ use_ddp: false
190
+ ddp_bucket_cap_mb: 100
191
+ num_gpus: 1
192
+ dataloader_num_workers: 2
193
+ remove_unused_columns: false
194
+ use_wandb: false
195
+ wandb_project: finetune-gr00t-n1d6
196
+ enable_profiling: false
197
+ max_retries: 3
198
+ assert_loss_less_than: null
199
+ add_rl_callback: false
200
+ enable_open_loop_eval: false
201
+ open_loop_eval_traj_ids:
202
+ - 0
203
+ open_loop_eval_steps_per_traj: 100
204
+ open_loop_eval_plot_indices: null
205
+ max_steps: 20000
206
+ save_steps: 5000
checkpoint-15000/experiment_cfg/config.yaml ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !!python/object:gr00t.configs.base_config.Config
2
+ data: !!python/object:gr00t.configs.data.data_config.DataConfig
3
+ allow_padding: false
4
+ datasets:
5
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
6
+ dataset_paths:
7
+ - ./cherry_data
8
+ dataset_type: physical_embodiment
9
+ embodiment_tag: new_embodiment
10
+ mix_ratio: 1.0
11
+ val_dataset_path: null
12
+ download_cache: false
13
+ episode_sampling_rate: 0.1
14
+ image_crop_size:
15
+ - 244
16
+ - 244
17
+ image_target_size:
18
+ - 224
19
+ - 224
20
+ mock_dataset_mode: false
21
+ modality_configs:
22
+ new_embodiment:
23
+ action: !!python/object:gr00t.data.types.ModalityConfig
24
+ action_configs:
25
+ - !!python/object:gr00t.data.types.ActionConfig
26
+ format: &id001 !!python/object/apply:gr00t.data.types.ActionFormat
27
+ - default
28
+ rep: &id002 !!python/object/apply:gr00t.data.types.ActionRepresentation
29
+ - absolute
30
+ state_key: null
31
+ type: &id003 !!python/object/apply:gr00t.data.types.ActionType
32
+ - non_eef
33
+ - !!python/object:gr00t.data.types.ActionConfig
34
+ format: *id001
35
+ rep: *id002
36
+ state_key: null
37
+ type: *id003
38
+ delta_indices:
39
+ - 0
40
+ - 1
41
+ - 2
42
+ - 3
43
+ - 4
44
+ - 5
45
+ - 6
46
+ - 7
47
+ - 8
48
+ - 9
49
+ - 10
50
+ - 11
51
+ - 12
52
+ - 13
53
+ - 14
54
+ - 15
55
+ mean_std_embedding_keys: null
56
+ modality_keys:
57
+ - arm
58
+ - gripper
59
+ sin_cos_embedding_keys: null
60
+ language: !!python/object:gr00t.data.types.ModalityConfig
61
+ action_configs: null
62
+ delta_indices:
63
+ - 0
64
+ mean_std_embedding_keys: null
65
+ modality_keys:
66
+ - annotation.human.task_description
67
+ sin_cos_embedding_keys: null
68
+ state: !!python/object:gr00t.data.types.ModalityConfig
69
+ action_configs: null
70
+ delta_indices:
71
+ - 0
72
+ mean_std_embedding_keys: null
73
+ modality_keys:
74
+ - arm
75
+ - gripper
76
+ sin_cos_embedding_keys: null
77
+ video: !!python/object:gr00t.data.types.ModalityConfig
78
+ action_configs: null
79
+ delta_indices:
80
+ - 0
81
+ mean_std_embedding_keys: null
82
+ modality_keys:
83
+ - cam_base
84
+ - cam_wrist
85
+ sin_cos_embedding_keys: null
86
+ mode: single_turn
87
+ multiprocessing_context: fork
88
+ num_shards_per_epoch: 100000
89
+ override_pretraining_statistics: false
90
+ random_chop: 0.0
91
+ seed: 42
92
+ shard_size: 1024
93
+ shuffle: true
94
+ subsample_ratio: 1.0
95
+ video_backend: decord
96
+ load_config_path: null
97
+ model: !!python/object:gr00t.configs.model.gr00t_n1d6.Gr00tN1d6Config
98
+ _attn_implementation_autoset: false
99
+ _attn_implementation_internal: null
100
+ _commit_hash: null
101
+ _name_or_path: ''
102
+ add_cross_attention: false
103
+ architectures: null
104
+ backbone_model_type: eagle
105
+ backbone_trainable_params_fp32: true
106
+ bad_words_ids: null
107
+ begin_suppress_tokens: null
108
+ bos_token_id: null
109
+ chunk_size_feed_forward: 0
110
+ color_jitter_params: null
111
+ cross_attention_hidden_size: null
112
+ decoder_start_token_id: null
113
+ diffusion_model_cfg:
114
+ attention_head_dim: 48
115
+ dropout: 0.2
116
+ final_dropout: true
117
+ interleave_self_attention: true
118
+ norm_type: ada_norm
119
+ num_attention_heads: 32
120
+ num_layers: 32
121
+ output_dim: 1024
122
+ positional_embeddings: null
123
+ diversity_penalty: 0.0
124
+ do_sample: false
125
+ eagle_collator: true
126
+ early_stopping: false
127
+ encoder_no_repeat_ngram_size: 0
128
+ eos_token_id: null
129
+ exponential_decay_length_penalty: null
130
+ extra_augmentation_config: null
131
+ finetuning_task: null
132
+ forced_bos_token_id: null
133
+ forced_eos_token_id: null
134
+ id2label:
135
+ 0: LABEL_0
136
+ 1: LABEL_1
137
+ is_decoder: false
138
+ is_encoder_decoder: false
139
+ label2id:
140
+ LABEL_0: 0
141
+ LABEL_1: 1
142
+ length_penalty: 1.0
143
+ load_bf16: false
144
+ max_length: 20
145
+ min_length: 0
146
+ model_name: nvidia/Eagle-Block2A-2B-v2
147
+ no_repeat_ngram_size: 0
148
+ num_beam_groups: 1
149
+ num_beams: 1
150
+ num_return_sequences: 1
151
+ output_attentions: false
152
+ output_hidden_states: false
153
+ output_scores: false
154
+ pad_token_id: null
155
+ prefix: null
156
+ problem_type: null
157
+ pruned_heads: {}
158
+ random_rotation_angle: null
159
+ remove_invalid_values: false
160
+ repetition_penalty: 1.0
161
+ reproject_vision: false
162
+ return_dict: true
163
+ return_dict_in_generate: false
164
+ sep_token_id: null
165
+ state_dropout_prob: 0.0
166
+ suppress_tokens: null
167
+ task_specific_params: null
168
+ temperature: 1.0
169
+ tf_legacy_loss: false
170
+ tie_encoder_decoder: false
171
+ tie_word_embeddings: true
172
+ tokenizer_class: null
173
+ top_k: 50
174
+ top_p: 1.0
175
+ torch_dtype: null
176
+ torchscript: false
177
+ transformers_version: null
178
+ tune_diffusion_model: true
179
+ tune_llm: false
180
+ tune_projector: true
181
+ tune_visual: false
182
+ typical_p: 1.0
183
+ use_bfloat16: false
184
+ use_relative_action: true
185
+ training: !!python/object:gr00t.configs.training.training_config.TrainingConfig
186
+ add_rl_callback: false
187
+ assert_loss_less_than: null
188
+ batch_size: null
189
+ bf16: true
190
+ dataloader_num_workers: 2
191
+ ddp_bucket_cap_mb: 100
192
+ deepspeed_stage: 2
193
+ enable_open_loop_eval: false
194
+ enable_profiling: false
195
+ eval_batch_size: 2
196
+ eval_bf16: true
197
+ eval_set_split_ratio: 0.1
198
+ eval_steps: 500
199
+ eval_strategy: 'no'
200
+ experiment_name: null
201
+ fp16: false
202
+ global_batch_size: 32
203
+ gradient_accumulation_steps: 1
204
+ gradient_checkpointing: false
205
+ learning_rate: 0.0001
206
+ logging_steps: 10
207
+ lr_scheduler_type: cosine
208
+ max_concurrent_uploads: 2
209
+ max_grad_norm: 1.0
210
+ max_retries: 3
211
+ max_steps: 20000
212
+ num_gpus: 1
213
+ open_loop_eval_plot_indices: null
214
+ open_loop_eval_steps_per_traj: 100
215
+ open_loop_eval_traj_ids:
216
+ - 0
217
+ optim: adamw_torch
218
+ output_dir: ./outputs/fr5_cherry
219
+ remove_unused_columns: false
220
+ save_best_eval_metric_greater_is_better: true
221
+ save_best_eval_metric_name: ''
222
+ save_steps: 5000
223
+ save_total_limit: 5
224
+ save_vl_model: false
225
+ start_from_checkpoint: ./pretrained_models/GR00T-N1.6-3B
226
+ tf32: true
227
+ transformers_access_token: null
228
+ transformers_cache_dir: null
229
+ transformers_local_files_only: false
230
+ transformers_trust_remote_code: true
231
+ upload_checkpoints: false
232
+ upload_every: 1000
233
+ upload_last_n_checkpoints: 5
234
+ use_ddp: false
235
+ use_wandb: false
236
+ wandb_project: finetune-gr00t-n1d6
237
+ warmup_ratio: 0.05
238
+ warmup_steps: 0
239
+ weight_decay: 1.0e-05
checkpoint-15000/experiment_cfg/dataset_statistics.json ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "new_embodiment": {
3
+ "state": {
4
+ "arm": {
5
+ "min": [
6
+ 0.4818978011608124,
7
+ -1.687173843383789,
8
+ 0.62826007604599,
9
+ -2.6761701107025146,
10
+ -1.8431425094604492,
11
+ -0.5678880214691162
12
+ ],
13
+ "max": [
14
+ 0.7935351133346558,
15
+ -1.014952301979065,
16
+ 1.8637524843215942,
17
+ -1.0820374488830566,
18
+ -1.4455490112304688,
19
+ 0.3115537762641907
20
+ ],
21
+ "mean": [
22
+ 0.6489784717559814,
23
+ -1.3269319534301758,
24
+ 1.356391429901123,
25
+ -1.804563045501709,
26
+ -1.619696021080017,
27
+ -0.07974076271057129
28
+ ],
29
+ "std": [
30
+ 0.053538445383310186,
31
+ 0.1604488044977188,
32
+ 0.2438623011112213,
33
+ 0.22075510025024414,
34
+ 0.07333532720804177,
35
+ 0.10092151165008545
36
+ ],
37
+ "q01": [
38
+ 0.5197953635454178,
39
+ -1.6432996988296509,
40
+ 0.8626433879137039,
41
+ -2.5542680168151857,
42
+ -1.8000394713878631,
43
+ -0.37301090329885483
44
+ ],
45
+ "q99": [
46
+ 0.7509180748462676,
47
+ -1.0879072868824005,
48
+ 1.7959050333499906,
49
+ -1.252977850437165,
50
+ -1.4705305182933812,
51
+ 0.2933953133225437
52
+ ]
53
+ },
54
+ "gripper": {
55
+ "min": [
56
+ 0.0
57
+ ],
58
+ "max": [
59
+ 1.0
60
+ ],
61
+ "mean": [
62
+ 0.7650123238563538
63
+ ],
64
+ "std": [
65
+ 0.39907386898994446
66
+ ],
67
+ "q01": [
68
+ 0.0
69
+ ],
70
+ "q99": [
71
+ 1.0
72
+ ]
73
+ }
74
+ },
75
+ "action": {
76
+ "arm": {
77
+ "min": [
78
+ 0.4818978011608124,
79
+ -1.687173843383789,
80
+ 0.62826007604599,
81
+ -2.6573522090911865,
82
+ -1.8431425094604492,
83
+ -0.5678880214691162
84
+ ],
85
+ "max": [
86
+ 0.7935351133346558,
87
+ -1.014952301979065,
88
+ 1.8637524843215942,
89
+ -1.0820374488830566,
90
+ -1.4455490112304688,
91
+ 0.3115537762641907
92
+ ],
93
+ "mean": [
94
+ 0.6489997506141663,
95
+ -1.326717495918274,
96
+ 1.355955958366394,
97
+ -1.8026670217514038,
98
+ -1.6199865341186523,
99
+ -0.07982920855283737
100
+ ],
101
+ "std": [
102
+ 0.05358240380883204,
103
+ 0.16021256148815155,
104
+ 0.243374302983284,
105
+ 0.2178075611591339,
106
+ 0.07321629673242531,
107
+ 0.10097639262676239
108
+ ],
109
+ "q01": [
110
+ 0.5197953635454178,
111
+ -1.6432996988296509,
112
+ 0.8625765931606293,
113
+ -2.53433034658432,
114
+ -1.8000823378562927,
115
+ -0.37301090329885483
116
+ ],
117
+ "q99": [
118
+ 0.7509180748462676,
119
+ -1.0879072868824005,
120
+ 1.7849992513656616,
121
+ -1.2526323044300085,
122
+ -1.4705633461475374,
123
+ 0.2933953133225437
124
+ ]
125
+ },
126
+ "gripper": {
127
+ "min": [
128
+ 0.0
129
+ ],
130
+ "max": [
131
+ 1.0
132
+ ],
133
+ "mean": [
134
+ 0.7650123238563538
135
+ ],
136
+ "std": [
137
+ 0.39907386898994446
138
+ ],
139
+ "q01": [
140
+ 0.0
141
+ ],
142
+ "q99": [
143
+ 1.0
144
+ ]
145
+ }
146
+ },
147
+ "relative_action": {}
148
+ }
149
+ }
checkpoint-15000/experiment_cfg/final_model_config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "Gr00tN1d6",
3
+ "model_dtype": "bfloat16",
4
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
5
+ "backbone_model_type": "eagle",
6
+ "model_revision": null,
7
+ "tune_top_llm_layers": 4,
8
+ "backbone_embedding_dim": 2048,
9
+ "tune_llm": false,
10
+ "tune_visual": false,
11
+ "select_layer": 16,
12
+ "reproject_vision": false,
13
+ "use_flash_attention": true,
14
+ "load_bf16": true,
15
+ "collator_overwrite_image_inputs": false,
16
+ "eagle_collator": true,
17
+ "backbone_trainable_params_fp32": true,
18
+ "extra_augmentation_config": null,
19
+ "apply_sincos_state_encoding": true,
20
+ "use_relative_action": true,
21
+ "max_state_dim": 128,
22
+ "max_action_dim": 128,
23
+ "action_horizon": 50,
24
+ "hidden_size": 1024,
25
+ "input_embedding_dim": 1536,
26
+ "add_pos_embed": true,
27
+ "attn_dropout": 0.2,
28
+ "use_vlln": true,
29
+ "max_seq_len": 1024,
30
+ "use_alternate_vl_dit": true,
31
+ "attend_text_every_n_blocks": 2,
32
+ "diffusion_model_cfg": {
33
+ "attention_head_dim": 48,
34
+ "dropout": 0.2,
35
+ "final_dropout": true,
36
+ "interleave_self_attention": true,
37
+ "norm_type": "ada_norm",
38
+ "num_attention_heads": 32,
39
+ "num_layers": 32,
40
+ "output_dim": 1024,
41
+ "positional_embeddings": null
42
+ },
43
+ "num_inference_timesteps": 4,
44
+ "noise_beta_alpha": 1.5,
45
+ "noise_beta_beta": 1.0,
46
+ "noise_s": 0.999,
47
+ "num_timestep_buckets": 1000,
48
+ "tune_projector": true,
49
+ "tune_diffusion_model": true,
50
+ "tune_vlln": true,
51
+ "state_dropout_prob": 0.0,
52
+ "state_additive_noise_scale": 0.0,
53
+ "max_num_embodiments": 32
54
+ }
checkpoint-15000/experiment_cfg/final_processor_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-15000/model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b2b93b15f33bd8ea70ac7f6b11aa60c56788947162d6ae00dd1987457f54da3
3
+ size 4990120184
checkpoint-15000/model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9be8fde5f5ace47b89e78e8cbea6a0b5494abb99ac6350affb66f53c7fc7c68
3
+ size 4823190320
checkpoint-15000/model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-15000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:018072403772e39d83a35fb933559eaf88901e875bf632858002d7b78a84a344
3
+ size 12960193762
checkpoint-15000/processor_config.json ADDED
@@ -0,0 +1,455 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "Gr00tN1d6Processor",
3
+ "processor_kwargs": {
4
+ "modality_configs": {
5
+ "behavior_r1_pro": {
6
+ "video": {
7
+ "delta_indices": [
8
+ 0
9
+ ],
10
+ "modality_keys": [
11
+ "observation.images.rgb.head_256_256",
12
+ "observation.images.rgb.left_wrist_256_256",
13
+ "observation.images.rgb.right_wrist_256_256"
14
+ ],
15
+ "sin_cos_embedding_keys": null,
16
+ "mean_std_embedding_keys": null,
17
+ "action_configs": null
18
+ },
19
+ "state": {
20
+ "delta_indices": [
21
+ 0
22
+ ],
23
+ "modality_keys": [
24
+ "robot_pos",
25
+ "robot_ori_cos",
26
+ "robot_ori_sin",
27
+ "robot_2d_ori",
28
+ "robot_2d_ori_cos",
29
+ "robot_2d_ori_sin",
30
+ "robot_lin_vel",
31
+ "robot_ang_vel",
32
+ "arm_left_qpos",
33
+ "arm_left_qpos_sin",
34
+ "arm_left_qpos_cos",
35
+ "eef_left_pos",
36
+ "eef_left_quat",
37
+ "gripper_left_qpos",
38
+ "arm_right_qpos",
39
+ "arm_right_qpos_sin",
40
+ "arm_right_qpos_cos",
41
+ "eef_right_pos",
42
+ "eef_right_quat",
43
+ "gripper_right_qpos",
44
+ "trunk_qpos"
45
+ ],
46
+ "sin_cos_embedding_keys": null,
47
+ "mean_std_embedding_keys": null,
48
+ "action_configs": null
49
+ },
50
+ "action": {
51
+ "delta_indices": [
52
+ 0,
53
+ 1,
54
+ 2,
55
+ 3,
56
+ 4,
57
+ 5,
58
+ 6,
59
+ 7,
60
+ 8,
61
+ 9,
62
+ 10,
63
+ 11,
64
+ 12,
65
+ 13,
66
+ 14,
67
+ 15,
68
+ 16,
69
+ 17,
70
+ 18,
71
+ 19,
72
+ 20,
73
+ 21,
74
+ 22,
75
+ 23,
76
+ 24,
77
+ 25,
78
+ 26,
79
+ 27,
80
+ 28,
81
+ 29,
82
+ 30,
83
+ 31
84
+ ],
85
+ "modality_keys": [
86
+ "base",
87
+ "torso",
88
+ "left_arm",
89
+ "left_gripper",
90
+ "right_arm",
91
+ "right_gripper"
92
+ ],
93
+ "sin_cos_embedding_keys": null,
94
+ "mean_std_embedding_keys": null,
95
+ "action_configs": [
96
+ {
97
+ "rep": "ABSOLUTE",
98
+ "type": "NON_EEF",
99
+ "format": "DEFAULT",
100
+ "state_key": null
101
+ },
102
+ {
103
+ "rep": "RELATIVE",
104
+ "type": "NON_EEF",
105
+ "format": "DEFAULT",
106
+ "state_key": "trunk_qpos"
107
+ },
108
+ {
109
+ "rep": "RELATIVE",
110
+ "type": "NON_EEF",
111
+ "format": "DEFAULT",
112
+ "state_key": "arm_left_qpos"
113
+ },
114
+ {
115
+ "rep": "ABSOLUTE",
116
+ "type": "NON_EEF",
117
+ "format": "DEFAULT",
118
+ "state_key": null
119
+ },
120
+ {
121
+ "rep": "RELATIVE",
122
+ "type": "NON_EEF",
123
+ "format": "DEFAULT",
124
+ "state_key": "arm_right_qpos"
125
+ },
126
+ {
127
+ "rep": "ABSOLUTE",
128
+ "type": "NON_EEF",
129
+ "format": "DEFAULT",
130
+ "state_key": null
131
+ }
132
+ ]
133
+ },
134
+ "language": {
135
+ "delta_indices": [
136
+ 0
137
+ ],
138
+ "modality_keys": [
139
+ "annotation.human.coarse_action"
140
+ ],
141
+ "sin_cos_embedding_keys": null,
142
+ "mean_std_embedding_keys": null,
143
+ "action_configs": null
144
+ }
145
+ },
146
+ "gr1": {
147
+ "video": {
148
+ "delta_indices": [
149
+ 0
150
+ ],
151
+ "modality_keys": [
152
+ "ego_view_bg_crop_pad_res256_freq20"
153
+ ],
154
+ "sin_cos_embedding_keys": null,
155
+ "mean_std_embedding_keys": null,
156
+ "action_configs": null
157
+ },
158
+ "state": {
159
+ "delta_indices": [
160
+ 0
161
+ ],
162
+ "modality_keys": [
163
+ "left_arm",
164
+ "right_arm",
165
+ "left_hand",
166
+ "right_hand",
167
+ "waist"
168
+ ],
169
+ "sin_cos_embedding_keys": [
170
+ "left_arm",
171
+ "right_arm",
172
+ "left_hand",
173
+ "right_hand",
174
+ "waist"
175
+ ],
176
+ "mean_std_embedding_keys": null,
177
+ "action_configs": null
178
+ },
179
+ "action": {
180
+ "delta_indices": [
181
+ 0,
182
+ 1,
183
+ 2,
184
+ 3,
185
+ 4,
186
+ 5,
187
+ 6,
188
+ 7,
189
+ 8,
190
+ 9,
191
+ 10,
192
+ 11,
193
+ 12,
194
+ 13,
195
+ 14,
196
+ 15
197
+ ],
198
+ "modality_keys": [
199
+ "left_arm",
200
+ "right_arm",
201
+ "left_hand",
202
+ "right_hand",
203
+ "waist"
204
+ ],
205
+ "sin_cos_embedding_keys": null,
206
+ "mean_std_embedding_keys": null,
207
+ "action_configs": [
208
+ {
209
+ "rep": "RELATIVE",
210
+ "type": "NON_EEF",
211
+ "format": "DEFAULT",
212
+ "state_key": null
213
+ },
214
+ {
215
+ "rep": "RELATIVE",
216
+ "type": "NON_EEF",
217
+ "format": "DEFAULT",
218
+ "state_key": null
219
+ },
220
+ {
221
+ "rep": "RELATIVE",
222
+ "type": "NON_EEF",
223
+ "format": "DEFAULT",
224
+ "state_key": null
225
+ },
226
+ {
227
+ "rep": "RELATIVE",
228
+ "type": "NON_EEF",
229
+ "format": "DEFAULT",
230
+ "state_key": null
231
+ },
232
+ {
233
+ "rep": "ABSOLUTE",
234
+ "type": "NON_EEF",
235
+ "format": "DEFAULT",
236
+ "state_key": null
237
+ }
238
+ ]
239
+ },
240
+ "language": {
241
+ "delta_indices": [
242
+ 0
243
+ ],
244
+ "modality_keys": [
245
+ "task"
246
+ ],
247
+ "sin_cos_embedding_keys": null,
248
+ "mean_std_embedding_keys": null,
249
+ "action_configs": null
250
+ }
251
+ },
252
+ "robocasa_panda_omron": {
253
+ "video": {
254
+ "delta_indices": [
255
+ 0
256
+ ],
257
+ "modality_keys": [
258
+ "res256_image_side_0",
259
+ "res256_image_side_1",
260
+ "res256_image_wrist_0"
261
+ ],
262
+ "sin_cos_embedding_keys": null,
263
+ "mean_std_embedding_keys": null,
264
+ "action_configs": null
265
+ },
266
+ "state": {
267
+ "delta_indices": [
268
+ 0
269
+ ],
270
+ "modality_keys": [
271
+ "end_effector_position_relative",
272
+ "end_effector_rotation_relative",
273
+ "gripper_qpos",
274
+ "base_position",
275
+ "base_rotation"
276
+ ],
277
+ "sin_cos_embedding_keys": null,
278
+ "mean_std_embedding_keys": null,
279
+ "action_configs": null
280
+ },
281
+ "action": {
282
+ "delta_indices": [
283
+ 0,
284
+ 1,
285
+ 2,
286
+ 3,
287
+ 4,
288
+ 5,
289
+ 6,
290
+ 7,
291
+ 8,
292
+ 9,
293
+ 10,
294
+ 11,
295
+ 12,
296
+ 13,
297
+ 14,
298
+ 15
299
+ ],
300
+ "modality_keys": [
301
+ "end_effector_position",
302
+ "end_effector_rotation",
303
+ "gripper_close",
304
+ "base_motion",
305
+ "control_mode"
306
+ ],
307
+ "sin_cos_embedding_keys": null,
308
+ "mean_std_embedding_keys": null,
309
+ "action_configs": [
310
+ {
311
+ "rep": "ABSOLUTE",
312
+ "type": "NON_EEF",
313
+ "format": "DEFAULT",
314
+ "state_key": null
315
+ },
316
+ {
317
+ "rep": "ABSOLUTE",
318
+ "type": "NON_EEF",
319
+ "format": "DEFAULT",
320
+ "state_key": null
321
+ },
322
+ {
323
+ "rep": "ABSOLUTE",
324
+ "type": "NON_EEF",
325
+ "format": "DEFAULT",
326
+ "state_key": null
327
+ },
328
+ {
329
+ "rep": "ABSOLUTE",
330
+ "type": "NON_EEF",
331
+ "format": "DEFAULT",
332
+ "state_key": null
333
+ },
334
+ {
335
+ "rep": "ABSOLUTE",
336
+ "type": "NON_EEF",
337
+ "format": "DEFAULT",
338
+ "state_key": null
339
+ }
340
+ ]
341
+ },
342
+ "language": {
343
+ "delta_indices": [
344
+ 0
345
+ ],
346
+ "modality_keys": [
347
+ "annotation.human.action.task_description"
348
+ ],
349
+ "sin_cos_embedding_keys": null,
350
+ "mean_std_embedding_keys": null,
351
+ "action_configs": null
352
+ }
353
+ },
354
+ "new_embodiment": {
355
+ "video": {
356
+ "delta_indices": [
357
+ 0
358
+ ],
359
+ "modality_keys": [
360
+ "cam_base",
361
+ "cam_wrist"
362
+ ],
363
+ "sin_cos_embedding_keys": null,
364
+ "mean_std_embedding_keys": null,
365
+ "action_configs": null
366
+ },
367
+ "state": {
368
+ "delta_indices": [
369
+ 0
370
+ ],
371
+ "modality_keys": [
372
+ "arm",
373
+ "gripper"
374
+ ],
375
+ "sin_cos_embedding_keys": null,
376
+ "mean_std_embedding_keys": null,
377
+ "action_configs": null
378
+ },
379
+ "action": {
380
+ "delta_indices": [
381
+ 0,
382
+ 1,
383
+ 2,
384
+ 3,
385
+ 4,
386
+ 5,
387
+ 6,
388
+ 7,
389
+ 8,
390
+ 9,
391
+ 10,
392
+ 11,
393
+ 12,
394
+ 13,
395
+ 14,
396
+ 15
397
+ ],
398
+ "modality_keys": [
399
+ "arm",
400
+ "gripper"
401
+ ],
402
+ "sin_cos_embedding_keys": null,
403
+ "mean_std_embedding_keys": null,
404
+ "action_configs": [
405
+ {
406
+ "rep": "ABSOLUTE",
407
+ "type": "NON_EEF",
408
+ "format": "DEFAULT",
409
+ "state_key": null
410
+ },
411
+ {
412
+ "rep": "ABSOLUTE",
413
+ "type": "NON_EEF",
414
+ "format": "DEFAULT",
415
+ "state_key": null
416
+ }
417
+ ]
418
+ },
419
+ "language": {
420
+ "delta_indices": [
421
+ 0
422
+ ],
423
+ "modality_keys": [
424
+ "annotation.human.task_description"
425
+ ],
426
+ "sin_cos_embedding_keys": null,
427
+ "mean_std_embedding_keys": null,
428
+ "action_configs": null
429
+ }
430
+ }
431
+ },
432
+ "image_crop_size": null,
433
+ "image_target_size": null,
434
+ "use_albumentations": true,
435
+ "random_rotation_angle": null,
436
+ "color_jitter_params": {
437
+ "brightness": 0.3,
438
+ "contrast": 0.4,
439
+ "saturation": 0.5,
440
+ "hue": 0.08
441
+ },
442
+ "shortest_image_edge": 256,
443
+ "crop_fraction": 0.95,
444
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
445
+ "model_type": "eagle",
446
+ "formalize_language": true,
447
+ "max_state_dim": 128,
448
+ "max_action_dim": 128,
449
+ "max_action_horizon": 50,
450
+ "use_percentiles": false,
451
+ "clip_outliers": true,
452
+ "apply_sincos_state_encoding": true,
453
+ "use_relative_action": true
454
+ }
455
+ }
checkpoint-15000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:292427e64b7e1ca74d7fc55bdd4f7612064cd77b7dcb6cf568f0be95dfd5152b
3
+ size 14645
checkpoint-15000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee0ce25fd589eb0fbcffab63bcc2e2a86e5fb56630601b06b6bc1e425539b706
3
+ size 1465
checkpoint-15000/statistics.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-15000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-15000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49cf1bb9160fba423ccdd2dae7d9b40228772dd15a3fb17d44a4fd07f7818f45
3
+ size 5713
checkpoint-15000/wandb_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"project": "finetune-gr00t-n1d6", "run_id": "fr5_cherry"}
checkpoint-20000/config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_horizon": 50,
3
+ "add_pos_embed": true,
4
+ "apply_sincos_state_encoding": true,
5
+ "architectures": [
6
+ "Gr00tN1d6"
7
+ ],
8
+ "attn_dropout": 0.2,
9
+ "attn_implementation": null,
10
+ "backbone_embedding_dim": 2048,
11
+ "backbone_model_type": "eagle",
12
+ "backbone_trainable_params_fp32": true,
13
+ "collator_overwrite_image_inputs": false,
14
+ "color_jitter_params": {
15
+ "brightness": 0.1,
16
+ "contrast": 0.1,
17
+ "hue": 0.1,
18
+ "saturation": 0.1
19
+ },
20
+ "crop_fraction": 0.95,
21
+ "diffusion_model_cfg": {
22
+ "attention_head_dim": 48,
23
+ "dropout": 0.2,
24
+ "final_dropout": true,
25
+ "interleave_self_attention": true,
26
+ "norm_type": "ada_norm",
27
+ "num_attention_heads": 32,
28
+ "num_layers": 32,
29
+ "output_dim": 1024,
30
+ "positional_embeddings": null
31
+ },
32
+ "eagle_collator": true,
33
+ "formalize_language": true,
34
+ "gemma_collator": false,
35
+ "hidden_size": 1024,
36
+ "image_crop_size": null,
37
+ "image_target_size": null,
38
+ "input_embedding_dim": 1536,
39
+ "load_bf16": true,
40
+ "max_action_dim": 128,
41
+ "max_num_embodiments": 32,
42
+ "max_seq_len": 1024,
43
+ "max_state_dim": 128,
44
+ "model_dtype": "bfloat16",
45
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
46
+ "model_type": "Gr00tN1d6",
47
+ "noise_beta_alpha": 1.5,
48
+ "noise_beta_beta": 1.0,
49
+ "noise_s": 0.999,
50
+ "num_inference_timesteps": 4,
51
+ "num_timestep_buckets": 1000,
52
+ "random_rotation_angle": null,
53
+ "reproject_vision": false,
54
+ "select_layer": 16,
55
+ "shortest_image_edge": 256,
56
+ "state_dropout_prob": 0.0,
57
+ "torch_dtype": "bfloat16",
58
+ "transformers_version": "4.51.3",
59
+ "tune_diffusion_model": true,
60
+ "tune_llm": false,
61
+ "tune_projector": true,
62
+ "tune_top_llm_layers": 4,
63
+ "tune_visual": false,
64
+ "tune_vlln": true,
65
+ "use_albumentations_transforms": true,
66
+ "use_alternate_vl_dit": true,
67
+ "use_flash_attention": true,
68
+ "use_relative_action": true,
69
+ "use_vlln": true
70
+ }
checkpoint-20000/embodiment_id.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "robocasa_panda_omron": 13,
3
+ "gr1": 20,
4
+ "behavior_r1_pro": 24,
5
+ "unitree_g1": 8,
6
+ "oxe_google": 0,
7
+ "oxe_widowx": 1,
8
+ "libero_panda": 2,
9
+ "oxe_droid": 16,
10
+ "new_embodiment": 10
11
+ }
checkpoint-20000/experiment_cfg/conf.yaml ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ load_config_path: null
2
+ model:
3
+ model_type: Gr00tN1d6
4
+ model_dtype: bfloat16
5
+ model_name: nvidia/Eagle-Block2A-2B-v2
6
+ backbone_model_type: eagle
7
+ model_revision: null
8
+ tune_top_llm_layers: 4
9
+ backbone_embedding_dim: 2048
10
+ tune_llm: false
11
+ tune_visual: false
12
+ select_layer: 16
13
+ reproject_vision: false
14
+ use_flash_attention: true
15
+ load_bf16: false
16
+ collator_overwrite_image_inputs: false
17
+ eagle_collator: true
18
+ backbone_trainable_params_fp32: true
19
+ image_crop_size: null
20
+ image_target_size: null
21
+ shortest_image_edge: 256
22
+ crop_fraction: 0.95
23
+ random_rotation_angle: null
24
+ color_jitter_params: null
25
+ use_albumentations_transforms: true
26
+ extra_augmentation_config: null
27
+ formalize_language: true
28
+ apply_sincos_state_encoding: false
29
+ use_relative_action: true
30
+ max_state_dim: 29
31
+ max_action_dim: 29
32
+ action_horizon: 16
33
+ hidden_size: 1024
34
+ input_embedding_dim: 1536
35
+ add_pos_embed: true
36
+ attn_dropout: 0.2
37
+ use_vlln: true
38
+ max_seq_len: 1024
39
+ use_alternate_vl_dit: true
40
+ attend_text_every_n_blocks: 2
41
+ diffusion_model_cfg:
42
+ positional_embeddings: null
43
+ num_layers: 32
44
+ num_attention_heads: 32
45
+ attention_head_dim: 48
46
+ norm_type: ada_norm
47
+ dropout: 0.2
48
+ final_dropout: true
49
+ output_dim: 1024
50
+ interleave_self_attention: true
51
+ num_inference_timesteps: 4
52
+ noise_beta_alpha: 1.5
53
+ noise_beta_beta: 1.0
54
+ noise_s: 0.999
55
+ num_timestep_buckets: 1000
56
+ tune_projector: true
57
+ tune_diffusion_model: true
58
+ tune_vlln: true
59
+ state_dropout_prob: 0.0
60
+ state_additive_noise_scale: 0.0
61
+ max_num_embodiments: 32
62
+ data:
63
+ datasets:
64
+ - dataset_paths:
65
+ - ./cherry_data
66
+ embodiment_tag: new_embodiment
67
+ mix_ratio: 1.0
68
+ dataset_type: physical_embodiment
69
+ val_dataset_path: null
70
+ modality_configs:
71
+ new_embodiment:
72
+ video:
73
+ delta_indices:
74
+ - 0
75
+ modality_keys:
76
+ - cam_base
77
+ - cam_wrist
78
+ sin_cos_embedding_keys: null
79
+ mean_std_embedding_keys: null
80
+ action_configs: null
81
+ state:
82
+ delta_indices:
83
+ - 0
84
+ modality_keys:
85
+ - arm
86
+ - gripper
87
+ sin_cos_embedding_keys: null
88
+ mean_std_embedding_keys: null
89
+ action_configs: null
90
+ action:
91
+ delta_indices:
92
+ - 0
93
+ - 1
94
+ - 2
95
+ - 3
96
+ - 4
97
+ - 5
98
+ - 6
99
+ - 7
100
+ - 8
101
+ - 9
102
+ - 10
103
+ - 11
104
+ - 12
105
+ - 13
106
+ - 14
107
+ - 15
108
+ modality_keys:
109
+ - arm
110
+ - gripper
111
+ sin_cos_embedding_keys: null
112
+ mean_std_embedding_keys: null
113
+ action_configs:
114
+ - rep: ABSOLUTE
115
+ type: NON_EEF
116
+ format: DEFAULT
117
+ state_key: null
118
+ - rep: ABSOLUTE
119
+ type: NON_EEF
120
+ format: DEFAULT
121
+ state_key: null
122
+ language:
123
+ delta_indices:
124
+ - 0
125
+ modality_keys:
126
+ - annotation.human.task_description
127
+ sin_cos_embedding_keys: null
128
+ mean_std_embedding_keys: null
129
+ action_configs: null
130
+ download_cache: false
131
+ shard_size: 1024
132
+ episode_sampling_rate: 0.1
133
+ num_shards_per_epoch: 100000
134
+ override_pretraining_statistics: false
135
+ mode: single_turn
136
+ random_chop: 0.0
137
+ mock_dataset_mode: false
138
+ shuffle: true
139
+ seed: 42
140
+ multiprocessing_context: fork
141
+ allow_padding: false
142
+ subsample_ratio: 1.0
143
+ image_crop_size:
144
+ - 244
145
+ - 244
146
+ image_target_size:
147
+ - 224
148
+ - 224
149
+ video_backend: decord
150
+ training:
151
+ output_dir: ./outputs/fr5_cherry
152
+ experiment_name: null
153
+ max_steps: 20000
154
+ global_batch_size: 32
155
+ batch_size: null
156
+ gradient_accumulation_steps: 1
157
+ learning_rate: 0.0001
158
+ lr_scheduler_type: cosine
159
+ weight_decay: 1.0e-05
160
+ warmup_ratio: 0.05
161
+ warmup_steps: 0
162
+ max_grad_norm: 1.0
163
+ optim: adamw_torch
164
+ start_from_checkpoint: ./pretrained_models/GR00T-N1.6-3B
165
+ tf32: true
166
+ fp16: false
167
+ bf16: true
168
+ eval_bf16: true
169
+ logging_steps: 10
170
+ save_steps: 5000
171
+ save_total_limit: 5
172
+ save_vl_model: false
173
+ upload_checkpoints: false
174
+ upload_every: 1000
175
+ upload_last_n_checkpoints: 5
176
+ max_concurrent_uploads: 2
177
+ eval_strategy: 'no'
178
+ eval_steps: 500
179
+ eval_set_split_ratio: 0.1
180
+ eval_batch_size: 2
181
+ save_best_eval_metric_name: ''
182
+ save_best_eval_metric_greater_is_better: true
183
+ deepspeed_stage: 2
184
+ gradient_checkpointing: false
185
+ transformers_trust_remote_code: true
186
+ transformers_local_files_only: false
187
+ transformers_cache_dir: null
188
+ transformers_access_token: null
189
+ use_ddp: false
190
+ ddp_bucket_cap_mb: 100
191
+ num_gpus: 1
192
+ dataloader_num_workers: 2
193
+ remove_unused_columns: false
194
+ use_wandb: false
195
+ wandb_project: finetune-gr00t-n1d6
196
+ enable_profiling: false
197
+ max_retries: 3
198
+ assert_loss_less_than: null
199
+ add_rl_callback: false
200
+ enable_open_loop_eval: false
201
+ open_loop_eval_traj_ids:
202
+ - 0
203
+ open_loop_eval_steps_per_traj: 100
204
+ open_loop_eval_plot_indices: null
205
+ max_steps: 20000
206
+ save_steps: 5000
checkpoint-20000/experiment_cfg/config.yaml ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !!python/object:gr00t.configs.base_config.Config
2
+ data: !!python/object:gr00t.configs.data.data_config.DataConfig
3
+ allow_padding: false
4
+ datasets:
5
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
6
+ dataset_paths:
7
+ - ./cherry_data
8
+ dataset_type: physical_embodiment
9
+ embodiment_tag: new_embodiment
10
+ mix_ratio: 1.0
11
+ val_dataset_path: null
12
+ download_cache: false
13
+ episode_sampling_rate: 0.1
14
+ image_crop_size:
15
+ - 244
16
+ - 244
17
+ image_target_size:
18
+ - 224
19
+ - 224
20
+ mock_dataset_mode: false
21
+ modality_configs:
22
+ new_embodiment:
23
+ action: !!python/object:gr00t.data.types.ModalityConfig
24
+ action_configs:
25
+ - !!python/object:gr00t.data.types.ActionConfig
26
+ format: &id001 !!python/object/apply:gr00t.data.types.ActionFormat
27
+ - default
28
+ rep: &id002 !!python/object/apply:gr00t.data.types.ActionRepresentation
29
+ - absolute
30
+ state_key: null
31
+ type: &id003 !!python/object/apply:gr00t.data.types.ActionType
32
+ - non_eef
33
+ - !!python/object:gr00t.data.types.ActionConfig
34
+ format: *id001
35
+ rep: *id002
36
+ state_key: null
37
+ type: *id003
38
+ delta_indices:
39
+ - 0
40
+ - 1
41
+ - 2
42
+ - 3
43
+ - 4
44
+ - 5
45
+ - 6
46
+ - 7
47
+ - 8
48
+ - 9
49
+ - 10
50
+ - 11
51
+ - 12
52
+ - 13
53
+ - 14
54
+ - 15
55
+ mean_std_embedding_keys: null
56
+ modality_keys:
57
+ - arm
58
+ - gripper
59
+ sin_cos_embedding_keys: null
60
+ language: !!python/object:gr00t.data.types.ModalityConfig
61
+ action_configs: null
62
+ delta_indices:
63
+ - 0
64
+ mean_std_embedding_keys: null
65
+ modality_keys:
66
+ - annotation.human.task_description
67
+ sin_cos_embedding_keys: null
68
+ state: !!python/object:gr00t.data.types.ModalityConfig
69
+ action_configs: null
70
+ delta_indices:
71
+ - 0
72
+ mean_std_embedding_keys: null
73
+ modality_keys:
74
+ - arm
75
+ - gripper
76
+ sin_cos_embedding_keys: null
77
+ video: !!python/object:gr00t.data.types.ModalityConfig
78
+ action_configs: null
79
+ delta_indices:
80
+ - 0
81
+ mean_std_embedding_keys: null
82
+ modality_keys:
83
+ - cam_base
84
+ - cam_wrist
85
+ sin_cos_embedding_keys: null
86
+ mode: single_turn
87
+ multiprocessing_context: fork
88
+ num_shards_per_epoch: 100000
89
+ override_pretraining_statistics: false
90
+ random_chop: 0.0
91
+ seed: 42
92
+ shard_size: 1024
93
+ shuffle: true
94
+ subsample_ratio: 1.0
95
+ video_backend: decord
96
+ load_config_path: null
97
+ model: !!python/object:gr00t.configs.model.gr00t_n1d6.Gr00tN1d6Config
98
+ _attn_implementation_autoset: false
99
+ _attn_implementation_internal: null
100
+ _commit_hash: null
101
+ _name_or_path: ''
102
+ add_cross_attention: false
103
+ architectures: null
104
+ backbone_model_type: eagle
105
+ backbone_trainable_params_fp32: true
106
+ bad_words_ids: null
107
+ begin_suppress_tokens: null
108
+ bos_token_id: null
109
+ chunk_size_feed_forward: 0
110
+ color_jitter_params: null
111
+ cross_attention_hidden_size: null
112
+ decoder_start_token_id: null
113
+ diffusion_model_cfg:
114
+ attention_head_dim: 48
115
+ dropout: 0.2
116
+ final_dropout: true
117
+ interleave_self_attention: true
118
+ norm_type: ada_norm
119
+ num_attention_heads: 32
120
+ num_layers: 32
121
+ output_dim: 1024
122
+ positional_embeddings: null
123
+ diversity_penalty: 0.0
124
+ do_sample: false
125
+ eagle_collator: true
126
+ early_stopping: false
127
+ encoder_no_repeat_ngram_size: 0
128
+ eos_token_id: null
129
+ exponential_decay_length_penalty: null
130
+ extra_augmentation_config: null
131
+ finetuning_task: null
132
+ forced_bos_token_id: null
133
+ forced_eos_token_id: null
134
+ id2label:
135
+ 0: LABEL_0
136
+ 1: LABEL_1
137
+ is_decoder: false
138
+ is_encoder_decoder: false
139
+ label2id:
140
+ LABEL_0: 0
141
+ LABEL_1: 1
142
+ length_penalty: 1.0
143
+ load_bf16: false
144
+ max_length: 20
145
+ min_length: 0
146
+ model_name: nvidia/Eagle-Block2A-2B-v2
147
+ no_repeat_ngram_size: 0
148
+ num_beam_groups: 1
149
+ num_beams: 1
150
+ num_return_sequences: 1
151
+ output_attentions: false
152
+ output_hidden_states: false
153
+ output_scores: false
154
+ pad_token_id: null
155
+ prefix: null
156
+ problem_type: null
157
+ pruned_heads: {}
158
+ random_rotation_angle: null
159
+ remove_invalid_values: false
160
+ repetition_penalty: 1.0
161
+ reproject_vision: false
162
+ return_dict: true
163
+ return_dict_in_generate: false
164
+ sep_token_id: null
165
+ state_dropout_prob: 0.0
166
+ suppress_tokens: null
167
+ task_specific_params: null
168
+ temperature: 1.0
169
+ tf_legacy_loss: false
170
+ tie_encoder_decoder: false
171
+ tie_word_embeddings: true
172
+ tokenizer_class: null
173
+ top_k: 50
174
+ top_p: 1.0
175
+ torch_dtype: null
176
+ torchscript: false
177
+ transformers_version: null
178
+ tune_diffusion_model: true
179
+ tune_llm: false
180
+ tune_projector: true
181
+ tune_visual: false
182
+ typical_p: 1.0
183
+ use_bfloat16: false
184
+ use_relative_action: true
185
+ training: !!python/object:gr00t.configs.training.training_config.TrainingConfig
186
+ add_rl_callback: false
187
+ assert_loss_less_than: null
188
+ batch_size: null
189
+ bf16: true
190
+ dataloader_num_workers: 2
191
+ ddp_bucket_cap_mb: 100
192
+ deepspeed_stage: 2
193
+ enable_open_loop_eval: false
194
+ enable_profiling: false
195
+ eval_batch_size: 2
196
+ eval_bf16: true
197
+ eval_set_split_ratio: 0.1
198
+ eval_steps: 500
199
+ eval_strategy: 'no'
200
+ experiment_name: null
201
+ fp16: false
202
+ global_batch_size: 32
203
+ gradient_accumulation_steps: 1
204
+ gradient_checkpointing: false
205
+ learning_rate: 0.0001
206
+ logging_steps: 10
207
+ lr_scheduler_type: cosine
208
+ max_concurrent_uploads: 2
209
+ max_grad_norm: 1.0
210
+ max_retries: 3
211
+ max_steps: 20000
212
+ num_gpus: 1
213
+ open_loop_eval_plot_indices: null
214
+ open_loop_eval_steps_per_traj: 100
215
+ open_loop_eval_traj_ids:
216
+ - 0
217
+ optim: adamw_torch
218
+ output_dir: ./outputs/fr5_cherry
219
+ remove_unused_columns: false
220
+ save_best_eval_metric_greater_is_better: true
221
+ save_best_eval_metric_name: ''
222
+ save_steps: 5000
223
+ save_total_limit: 5
224
+ save_vl_model: false
225
+ start_from_checkpoint: ./pretrained_models/GR00T-N1.6-3B
226
+ tf32: true
227
+ transformers_access_token: null
228
+ transformers_cache_dir: null
229
+ transformers_local_files_only: false
230
+ transformers_trust_remote_code: true
231
+ upload_checkpoints: false
232
+ upload_every: 1000
233
+ upload_last_n_checkpoints: 5
234
+ use_ddp: false
235
+ use_wandb: false
236
+ wandb_project: finetune-gr00t-n1d6
237
+ warmup_ratio: 0.05
238
+ warmup_steps: 0
239
+ weight_decay: 1.0e-05
checkpoint-20000/experiment_cfg/dataset_statistics.json ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "new_embodiment": {
3
+ "state": {
4
+ "arm": {
5
+ "min": [
6
+ 0.4818978011608124,
7
+ -1.687173843383789,
8
+ 0.62826007604599,
9
+ -2.6761701107025146,
10
+ -1.8431425094604492,
11
+ -0.5678880214691162
12
+ ],
13
+ "max": [
14
+ 0.7935351133346558,
15
+ -1.014952301979065,
16
+ 1.8637524843215942,
17
+ -1.0820374488830566,
18
+ -1.4455490112304688,
19
+ 0.3115537762641907
20
+ ],
21
+ "mean": [
22
+ 0.6489784717559814,
23
+ -1.3269319534301758,
24
+ 1.356391429901123,
25
+ -1.804563045501709,
26
+ -1.619696021080017,
27
+ -0.07974076271057129
28
+ ],
29
+ "std": [
30
+ 0.053538445383310186,
31
+ 0.1604488044977188,
32
+ 0.2438623011112213,
33
+ 0.22075510025024414,
34
+ 0.07333532720804177,
35
+ 0.10092151165008545
36
+ ],
37
+ "q01": [
38
+ 0.5197953635454178,
39
+ -1.6432996988296509,
40
+ 0.8626433879137039,
41
+ -2.5542680168151857,
42
+ -1.8000394713878631,
43
+ -0.37301090329885483
44
+ ],
45
+ "q99": [
46
+ 0.7509180748462676,
47
+ -1.0879072868824005,
48
+ 1.7959050333499906,
49
+ -1.252977850437165,
50
+ -1.4705305182933812,
51
+ 0.2933953133225437
52
+ ]
53
+ },
54
+ "gripper": {
55
+ "min": [
56
+ 0.0
57
+ ],
58
+ "max": [
59
+ 1.0
60
+ ],
61
+ "mean": [
62
+ 0.7650123238563538
63
+ ],
64
+ "std": [
65
+ 0.39907386898994446
66
+ ],
67
+ "q01": [
68
+ 0.0
69
+ ],
70
+ "q99": [
71
+ 1.0
72
+ ]
73
+ }
74
+ },
75
+ "action": {
76
+ "arm": {
77
+ "min": [
78
+ 0.4818978011608124,
79
+ -1.687173843383789,
80
+ 0.62826007604599,
81
+ -2.6573522090911865,
82
+ -1.8431425094604492,
83
+ -0.5678880214691162
84
+ ],
85
+ "max": [
86
+ 0.7935351133346558,
87
+ -1.014952301979065,
88
+ 1.8637524843215942,
89
+ -1.0820374488830566,
90
+ -1.4455490112304688,
91
+ 0.3115537762641907
92
+ ],
93
+ "mean": [
94
+ 0.6489997506141663,
95
+ -1.326717495918274,
96
+ 1.355955958366394,
97
+ -1.8026670217514038,
98
+ -1.6199865341186523,
99
+ -0.07982920855283737
100
+ ],
101
+ "std": [
102
+ 0.05358240380883204,
103
+ 0.16021256148815155,
104
+ 0.243374302983284,
105
+ 0.2178075611591339,
106
+ 0.07321629673242531,
107
+ 0.10097639262676239
108
+ ],
109
+ "q01": [
110
+ 0.5197953635454178,
111
+ -1.6432996988296509,
112
+ 0.8625765931606293,
113
+ -2.53433034658432,
114
+ -1.8000823378562927,
115
+ -0.37301090329885483
116
+ ],
117
+ "q99": [
118
+ 0.7509180748462676,
119
+ -1.0879072868824005,
120
+ 1.7849992513656616,
121
+ -1.2526323044300085,
122
+ -1.4705633461475374,
123
+ 0.2933953133225437
124
+ ]
125
+ },
126
+ "gripper": {
127
+ "min": [
128
+ 0.0
129
+ ],
130
+ "max": [
131
+ 1.0
132
+ ],
133
+ "mean": [
134
+ 0.7650123238563538
135
+ ],
136
+ "std": [
137
+ 0.39907386898994446
138
+ ],
139
+ "q01": [
140
+ 0.0
141
+ ],
142
+ "q99": [
143
+ 1.0
144
+ ]
145
+ }
146
+ },
147
+ "relative_action": {}
148
+ }
149
+ }
checkpoint-20000/experiment_cfg/final_model_config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "Gr00tN1d6",
3
+ "model_dtype": "bfloat16",
4
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
5
+ "backbone_model_type": "eagle",
6
+ "model_revision": null,
7
+ "tune_top_llm_layers": 4,
8
+ "backbone_embedding_dim": 2048,
9
+ "tune_llm": false,
10
+ "tune_visual": false,
11
+ "select_layer": 16,
12
+ "reproject_vision": false,
13
+ "use_flash_attention": true,
14
+ "load_bf16": true,
15
+ "collator_overwrite_image_inputs": false,
16
+ "eagle_collator": true,
17
+ "backbone_trainable_params_fp32": true,
18
+ "extra_augmentation_config": null,
19
+ "apply_sincos_state_encoding": true,
20
+ "use_relative_action": true,
21
+ "max_state_dim": 128,
22
+ "max_action_dim": 128,
23
+ "action_horizon": 50,
24
+ "hidden_size": 1024,
25
+ "input_embedding_dim": 1536,
26
+ "add_pos_embed": true,
27
+ "attn_dropout": 0.2,
28
+ "use_vlln": true,
29
+ "max_seq_len": 1024,
30
+ "use_alternate_vl_dit": true,
31
+ "attend_text_every_n_blocks": 2,
32
+ "diffusion_model_cfg": {
33
+ "attention_head_dim": 48,
34
+ "dropout": 0.2,
35
+ "final_dropout": true,
36
+ "interleave_self_attention": true,
37
+ "norm_type": "ada_norm",
38
+ "num_attention_heads": 32,
39
+ "num_layers": 32,
40
+ "output_dim": 1024,
41
+ "positional_embeddings": null
42
+ },
43
+ "num_inference_timesteps": 4,
44
+ "noise_beta_alpha": 1.5,
45
+ "noise_beta_beta": 1.0,
46
+ "noise_s": 0.999,
47
+ "num_timestep_buckets": 1000,
48
+ "tune_projector": true,
49
+ "tune_diffusion_model": true,
50
+ "tune_vlln": true,
51
+ "state_dropout_prob": 0.0,
52
+ "state_additive_noise_scale": 0.0,
53
+ "max_num_embodiments": 32
54
+ }
checkpoint-20000/experiment_cfg/final_processor_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-20000/model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e567773ecb7739580072e23d2cf32c20483faa087d934138f85f3e9f717ef54c
3
+ size 4990120184
checkpoint-20000/model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a82b8434c44a16ff46ed3ac65a9082e15ef327779100bf4e7ac0633f694dc1dc
3
+ size 4823190320
checkpoint-20000/model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-20000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4808f6a3470dddc484424c6c3648b3b4b808e96ea886a0505ea84dcea913924
3
+ size 12960193762
checkpoint-20000/processor_config.json ADDED
@@ -0,0 +1,455 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "Gr00tN1d6Processor",
3
+ "processor_kwargs": {
4
+ "modality_configs": {
5
+ "behavior_r1_pro": {
6
+ "video": {
7
+ "delta_indices": [
8
+ 0
9
+ ],
10
+ "modality_keys": [
11
+ "observation.images.rgb.head_256_256",
12
+ "observation.images.rgb.left_wrist_256_256",
13
+ "observation.images.rgb.right_wrist_256_256"
14
+ ],
15
+ "sin_cos_embedding_keys": null,
16
+ "mean_std_embedding_keys": null,
17
+ "action_configs": null
18
+ },
19
+ "state": {
20
+ "delta_indices": [
21
+ 0
22
+ ],
23
+ "modality_keys": [
24
+ "robot_pos",
25
+ "robot_ori_cos",
26
+ "robot_ori_sin",
27
+ "robot_2d_ori",
28
+ "robot_2d_ori_cos",
29
+ "robot_2d_ori_sin",
30
+ "robot_lin_vel",
31
+ "robot_ang_vel",
32
+ "arm_left_qpos",
33
+ "arm_left_qpos_sin",
34
+ "arm_left_qpos_cos",
35
+ "eef_left_pos",
36
+ "eef_left_quat",
37
+ "gripper_left_qpos",
38
+ "arm_right_qpos",
39
+ "arm_right_qpos_sin",
40
+ "arm_right_qpos_cos",
41
+ "eef_right_pos",
42
+ "eef_right_quat",
43
+ "gripper_right_qpos",
44
+ "trunk_qpos"
45
+ ],
46
+ "sin_cos_embedding_keys": null,
47
+ "mean_std_embedding_keys": null,
48
+ "action_configs": null
49
+ },
50
+ "action": {
51
+ "delta_indices": [
52
+ 0,
53
+ 1,
54
+ 2,
55
+ 3,
56
+ 4,
57
+ 5,
58
+ 6,
59
+ 7,
60
+ 8,
61
+ 9,
62
+ 10,
63
+ 11,
64
+ 12,
65
+ 13,
66
+ 14,
67
+ 15,
68
+ 16,
69
+ 17,
70
+ 18,
71
+ 19,
72
+ 20,
73
+ 21,
74
+ 22,
75
+ 23,
76
+ 24,
77
+ 25,
78
+ 26,
79
+ 27,
80
+ 28,
81
+ 29,
82
+ 30,
83
+ 31
84
+ ],
85
+ "modality_keys": [
86
+ "base",
87
+ "torso",
88
+ "left_arm",
89
+ "left_gripper",
90
+ "right_arm",
91
+ "right_gripper"
92
+ ],
93
+ "sin_cos_embedding_keys": null,
94
+ "mean_std_embedding_keys": null,
95
+ "action_configs": [
96
+ {
97
+ "rep": "ABSOLUTE",
98
+ "type": "NON_EEF",
99
+ "format": "DEFAULT",
100
+ "state_key": null
101
+ },
102
+ {
103
+ "rep": "RELATIVE",
104
+ "type": "NON_EEF",
105
+ "format": "DEFAULT",
106
+ "state_key": "trunk_qpos"
107
+ },
108
+ {
109
+ "rep": "RELATIVE",
110
+ "type": "NON_EEF",
111
+ "format": "DEFAULT",
112
+ "state_key": "arm_left_qpos"
113
+ },
114
+ {
115
+ "rep": "ABSOLUTE",
116
+ "type": "NON_EEF",
117
+ "format": "DEFAULT",
118
+ "state_key": null
119
+ },
120
+ {
121
+ "rep": "RELATIVE",
122
+ "type": "NON_EEF",
123
+ "format": "DEFAULT",
124
+ "state_key": "arm_right_qpos"
125
+ },
126
+ {
127
+ "rep": "ABSOLUTE",
128
+ "type": "NON_EEF",
129
+ "format": "DEFAULT",
130
+ "state_key": null
131
+ }
132
+ ]
133
+ },
134
+ "language": {
135
+ "delta_indices": [
136
+ 0
137
+ ],
138
+ "modality_keys": [
139
+ "annotation.human.coarse_action"
140
+ ],
141
+ "sin_cos_embedding_keys": null,
142
+ "mean_std_embedding_keys": null,
143
+ "action_configs": null
144
+ }
145
+ },
146
+ "gr1": {
147
+ "video": {
148
+ "delta_indices": [
149
+ 0
150
+ ],
151
+ "modality_keys": [
152
+ "ego_view_bg_crop_pad_res256_freq20"
153
+ ],
154
+ "sin_cos_embedding_keys": null,
155
+ "mean_std_embedding_keys": null,
156
+ "action_configs": null
157
+ },
158
+ "state": {
159
+ "delta_indices": [
160
+ 0
161
+ ],
162
+ "modality_keys": [
163
+ "left_arm",
164
+ "right_arm",
165
+ "left_hand",
166
+ "right_hand",
167
+ "waist"
168
+ ],
169
+ "sin_cos_embedding_keys": [
170
+ "left_arm",
171
+ "right_arm",
172
+ "left_hand",
173
+ "right_hand",
174
+ "waist"
175
+ ],
176
+ "mean_std_embedding_keys": null,
177
+ "action_configs": null
178
+ },
179
+ "action": {
180
+ "delta_indices": [
181
+ 0,
182
+ 1,
183
+ 2,
184
+ 3,
185
+ 4,
186
+ 5,
187
+ 6,
188
+ 7,
189
+ 8,
190
+ 9,
191
+ 10,
192
+ 11,
193
+ 12,
194
+ 13,
195
+ 14,
196
+ 15
197
+ ],
198
+ "modality_keys": [
199
+ "left_arm",
200
+ "right_arm",
201
+ "left_hand",
202
+ "right_hand",
203
+ "waist"
204
+ ],
205
+ "sin_cos_embedding_keys": null,
206
+ "mean_std_embedding_keys": null,
207
+ "action_configs": [
208
+ {
209
+ "rep": "RELATIVE",
210
+ "type": "NON_EEF",
211
+ "format": "DEFAULT",
212
+ "state_key": null
213
+ },
214
+ {
215
+ "rep": "RELATIVE",
216
+ "type": "NON_EEF",
217
+ "format": "DEFAULT",
218
+ "state_key": null
219
+ },
220
+ {
221
+ "rep": "RELATIVE",
222
+ "type": "NON_EEF",
223
+ "format": "DEFAULT",
224
+ "state_key": null
225
+ },
226
+ {
227
+ "rep": "RELATIVE",
228
+ "type": "NON_EEF",
229
+ "format": "DEFAULT",
230
+ "state_key": null
231
+ },
232
+ {
233
+ "rep": "ABSOLUTE",
234
+ "type": "NON_EEF",
235
+ "format": "DEFAULT",
236
+ "state_key": null
237
+ }
238
+ ]
239
+ },
240
+ "language": {
241
+ "delta_indices": [
242
+ 0
243
+ ],
244
+ "modality_keys": [
245
+ "task"
246
+ ],
247
+ "sin_cos_embedding_keys": null,
248
+ "mean_std_embedding_keys": null,
249
+ "action_configs": null
250
+ }
251
+ },
252
+ "robocasa_panda_omron": {
253
+ "video": {
254
+ "delta_indices": [
255
+ 0
256
+ ],
257
+ "modality_keys": [
258
+ "res256_image_side_0",
259
+ "res256_image_side_1",
260
+ "res256_image_wrist_0"
261
+ ],
262
+ "sin_cos_embedding_keys": null,
263
+ "mean_std_embedding_keys": null,
264
+ "action_configs": null
265
+ },
266
+ "state": {
267
+ "delta_indices": [
268
+ 0
269
+ ],
270
+ "modality_keys": [
271
+ "end_effector_position_relative",
272
+ "end_effector_rotation_relative",
273
+ "gripper_qpos",
274
+ "base_position",
275
+ "base_rotation"
276
+ ],
277
+ "sin_cos_embedding_keys": null,
278
+ "mean_std_embedding_keys": null,
279
+ "action_configs": null
280
+ },
281
+ "action": {
282
+ "delta_indices": [
283
+ 0,
284
+ 1,
285
+ 2,
286
+ 3,
287
+ 4,
288
+ 5,
289
+ 6,
290
+ 7,
291
+ 8,
292
+ 9,
293
+ 10,
294
+ 11,
295
+ 12,
296
+ 13,
297
+ 14,
298
+ 15
299
+ ],
300
+ "modality_keys": [
301
+ "end_effector_position",
302
+ "end_effector_rotation",
303
+ "gripper_close",
304
+ "base_motion",
305
+ "control_mode"
306
+ ],
307
+ "sin_cos_embedding_keys": null,
308
+ "mean_std_embedding_keys": null,
309
+ "action_configs": [
310
+ {
311
+ "rep": "ABSOLUTE",
312
+ "type": "NON_EEF",
313
+ "format": "DEFAULT",
314
+ "state_key": null
315
+ },
316
+ {
317
+ "rep": "ABSOLUTE",
318
+ "type": "NON_EEF",
319
+ "format": "DEFAULT",
320
+ "state_key": null
321
+ },
322
+ {
323
+ "rep": "ABSOLUTE",
324
+ "type": "NON_EEF",
325
+ "format": "DEFAULT",
326
+ "state_key": null
327
+ },
328
+ {
329
+ "rep": "ABSOLUTE",
330
+ "type": "NON_EEF",
331
+ "format": "DEFAULT",
332
+ "state_key": null
333
+ },
334
+ {
335
+ "rep": "ABSOLUTE",
336
+ "type": "NON_EEF",
337
+ "format": "DEFAULT",
338
+ "state_key": null
339
+ }
340
+ ]
341
+ },
342
+ "language": {
343
+ "delta_indices": [
344
+ 0
345
+ ],
346
+ "modality_keys": [
347
+ "annotation.human.action.task_description"
348
+ ],
349
+ "sin_cos_embedding_keys": null,
350
+ "mean_std_embedding_keys": null,
351
+ "action_configs": null
352
+ }
353
+ },
354
+ "new_embodiment": {
355
+ "video": {
356
+ "delta_indices": [
357
+ 0
358
+ ],
359
+ "modality_keys": [
360
+ "cam_base",
361
+ "cam_wrist"
362
+ ],
363
+ "sin_cos_embedding_keys": null,
364
+ "mean_std_embedding_keys": null,
365
+ "action_configs": null
366
+ },
367
+ "state": {
368
+ "delta_indices": [
369
+ 0
370
+ ],
371
+ "modality_keys": [
372
+ "arm",
373
+ "gripper"
374
+ ],
375
+ "sin_cos_embedding_keys": null,
376
+ "mean_std_embedding_keys": null,
377
+ "action_configs": null
378
+ },
379
+ "action": {
380
+ "delta_indices": [
381
+ 0,
382
+ 1,
383
+ 2,
384
+ 3,
385
+ 4,
386
+ 5,
387
+ 6,
388
+ 7,
389
+ 8,
390
+ 9,
391
+ 10,
392
+ 11,
393
+ 12,
394
+ 13,
395
+ 14,
396
+ 15
397
+ ],
398
+ "modality_keys": [
399
+ "arm",
400
+ "gripper"
401
+ ],
402
+ "sin_cos_embedding_keys": null,
403
+ "mean_std_embedding_keys": null,
404
+ "action_configs": [
405
+ {
406
+ "rep": "ABSOLUTE",
407
+ "type": "NON_EEF",
408
+ "format": "DEFAULT",
409
+ "state_key": null
410
+ },
411
+ {
412
+ "rep": "ABSOLUTE",
413
+ "type": "NON_EEF",
414
+ "format": "DEFAULT",
415
+ "state_key": null
416
+ }
417
+ ]
418
+ },
419
+ "language": {
420
+ "delta_indices": [
421
+ 0
422
+ ],
423
+ "modality_keys": [
424
+ "annotation.human.task_description"
425
+ ],
426
+ "sin_cos_embedding_keys": null,
427
+ "mean_std_embedding_keys": null,
428
+ "action_configs": null
429
+ }
430
+ }
431
+ },
432
+ "image_crop_size": null,
433
+ "image_target_size": null,
434
+ "use_albumentations": true,
435
+ "random_rotation_angle": null,
436
+ "color_jitter_params": {
437
+ "brightness": 0.3,
438
+ "contrast": 0.4,
439
+ "saturation": 0.5,
440
+ "hue": 0.08
441
+ },
442
+ "shortest_image_edge": 256,
443
+ "crop_fraction": 0.95,
444
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
445
+ "model_type": "eagle",
446
+ "formalize_language": true,
447
+ "max_state_dim": 128,
448
+ "max_action_dim": 128,
449
+ "max_action_horizon": 50,
450
+ "use_percentiles": false,
451
+ "clip_outliers": true,
452
+ "apply_sincos_state_encoding": true,
453
+ "use_relative_action": true
454
+ }
455
+ }
checkpoint-20000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:675751298697509e109188026494482b1da89c72d7a1ba3abec2e498516d2755
3
+ size 14645
checkpoint-20000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fa802a80def971b73ec74284a6aa44d0b2ea101bd38ed41a3b1c1a0b4001f00
3
+ size 1465