ry-5 committed on
Commit
88304e9
·
verified ·
1 Parent(s): 2a4e638

Remove intermediate checkpoints, keep final model only

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. checkpoint-10000/config.json +0 -70
  2. checkpoint-10000/embodiment_id.json +0 -11
  3. checkpoint-10000/experiment_cfg/conf.yaml +0 -206
  4. checkpoint-10000/experiment_cfg/config.yaml +0 -239
  5. checkpoint-10000/experiment_cfg/dataset_statistics.json +0 -149
  6. checkpoint-10000/experiment_cfg/final_model_config.json +0 -54
  7. checkpoint-10000/experiment_cfg/final_processor_config.json +0 -0
  8. checkpoint-10000/model-00001-of-00002.safetensors +0 -3
  9. checkpoint-10000/model-00002-of-00002.safetensors +0 -3
  10. checkpoint-10000/model.safetensors.index.json +0 -0
  11. checkpoint-10000/optimizer.pt +0 -3
  12. checkpoint-10000/processor_config.json +0 -455
  13. checkpoint-10000/rng_state.pth +0 -3
  14. checkpoint-10000/scheduler.pt +0 -3
  15. checkpoint-10000/statistics.json +0 -0
  16. checkpoint-10000/trainer_state.json +0 -0
  17. checkpoint-10000/training_args.bin +0 -3
  18. checkpoint-10000/wandb_config.json +0 -1
  19. checkpoint-15000/config.json +0 -70
  20. checkpoint-15000/embodiment_id.json +0 -11
  21. checkpoint-15000/experiment_cfg/conf.yaml +0 -206
  22. checkpoint-15000/experiment_cfg/config.yaml +0 -239
  23. checkpoint-15000/experiment_cfg/dataset_statistics.json +0 -149
  24. checkpoint-15000/experiment_cfg/final_model_config.json +0 -54
  25. checkpoint-15000/experiment_cfg/final_processor_config.json +0 -0
  26. checkpoint-15000/model-00001-of-00002.safetensors +0 -3
  27. checkpoint-15000/model-00002-of-00002.safetensors +0 -3
  28. checkpoint-15000/model.safetensors.index.json +0 -0
  29. checkpoint-15000/optimizer.pt +0 -3
  30. checkpoint-15000/processor_config.json +0 -455
  31. checkpoint-15000/rng_state.pth +0 -3
  32. checkpoint-15000/scheduler.pt +0 -3
  33. checkpoint-15000/statistics.json +0 -0
  34. checkpoint-15000/trainer_state.json +0 -0
  35. checkpoint-15000/training_args.bin +0 -3
  36. checkpoint-15000/wandb_config.json +0 -1
  37. checkpoint-20000/config.json +0 -70
  38. checkpoint-20000/embodiment_id.json +0 -11
  39. checkpoint-20000/experiment_cfg/conf.yaml +0 -206
  40. checkpoint-20000/experiment_cfg/config.yaml +0 -239
  41. checkpoint-20000/experiment_cfg/dataset_statistics.json +0 -149
  42. checkpoint-20000/experiment_cfg/final_model_config.json +0 -54
  43. checkpoint-20000/experiment_cfg/final_processor_config.json +0 -0
  44. checkpoint-20000/model-00001-of-00002.safetensors +0 -3
  45. checkpoint-20000/model-00002-of-00002.safetensors +0 -3
  46. checkpoint-20000/model.safetensors.index.json +0 -0
  47. checkpoint-20000/optimizer.pt +0 -3
  48. checkpoint-20000/processor_config.json +0 -455
  49. checkpoint-20000/rng_state.pth +0 -3
  50. checkpoint-20000/scheduler.pt +0 -3
checkpoint-10000/config.json DELETED
@@ -1,70 +0,0 @@
1
- {
2
- "action_horizon": 50,
3
- "add_pos_embed": true,
4
- "apply_sincos_state_encoding": true,
5
- "architectures": [
6
- "Gr00tN1d6"
7
- ],
8
- "attn_dropout": 0.2,
9
- "attn_implementation": null,
10
- "backbone_embedding_dim": 2048,
11
- "backbone_model_type": "eagle",
12
- "backbone_trainable_params_fp32": true,
13
- "collator_overwrite_image_inputs": false,
14
- "color_jitter_params": {
15
- "brightness": 0.1,
16
- "contrast": 0.1,
17
- "hue": 0.1,
18
- "saturation": 0.1
19
- },
20
- "crop_fraction": 0.95,
21
- "diffusion_model_cfg": {
22
- "attention_head_dim": 48,
23
- "dropout": 0.2,
24
- "final_dropout": true,
25
- "interleave_self_attention": true,
26
- "norm_type": "ada_norm",
27
- "num_attention_heads": 32,
28
- "num_layers": 32,
29
- "output_dim": 1024,
30
- "positional_embeddings": null
31
- },
32
- "eagle_collator": true,
33
- "formalize_language": true,
34
- "gemma_collator": false,
35
- "hidden_size": 1024,
36
- "image_crop_size": null,
37
- "image_target_size": null,
38
- "input_embedding_dim": 1536,
39
- "load_bf16": true,
40
- "max_action_dim": 128,
41
- "max_num_embodiments": 32,
42
- "max_seq_len": 1024,
43
- "max_state_dim": 128,
44
- "model_dtype": "bfloat16",
45
- "model_name": "nvidia/Eagle-Block2A-2B-v2",
46
- "model_type": "Gr00tN1d6",
47
- "noise_beta_alpha": 1.5,
48
- "noise_beta_beta": 1.0,
49
- "noise_s": 0.999,
50
- "num_inference_timesteps": 4,
51
- "num_timestep_buckets": 1000,
52
- "random_rotation_angle": null,
53
- "reproject_vision": false,
54
- "select_layer": 16,
55
- "shortest_image_edge": 256,
56
- "state_dropout_prob": 0.0,
57
- "torch_dtype": "bfloat16",
58
- "transformers_version": "4.51.3",
59
- "tune_diffusion_model": true,
60
- "tune_llm": false,
61
- "tune_projector": true,
62
- "tune_top_llm_layers": 4,
63
- "tune_visual": false,
64
- "tune_vlln": true,
65
- "use_albumentations_transforms": true,
66
- "use_alternate_vl_dit": true,
67
- "use_flash_attention": true,
68
- "use_relative_action": true,
69
- "use_vlln": true
70
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-10000/embodiment_id.json DELETED
@@ -1,11 +0,0 @@
1
- {
2
- "robocasa_panda_omron": 13,
3
- "gr1": 20,
4
- "behavior_r1_pro": 24,
5
- "unitree_g1": 8,
6
- "oxe_google": 0,
7
- "oxe_widowx": 1,
8
- "libero_panda": 2,
9
- "oxe_droid": 16,
10
- "new_embodiment": 10
11
- }
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-10000/experiment_cfg/conf.yaml DELETED
@@ -1,206 +0,0 @@
1
- load_config_path: null
2
- model:
3
- model_type: Gr00tN1d6
4
- model_dtype: bfloat16
5
- model_name: nvidia/Eagle-Block2A-2B-v2
6
- backbone_model_type: eagle
7
- model_revision: null
8
- tune_top_llm_layers: 4
9
- backbone_embedding_dim: 2048
10
- tune_llm: false
11
- tune_visual: false
12
- select_layer: 16
13
- reproject_vision: false
14
- use_flash_attention: true
15
- load_bf16: false
16
- collator_overwrite_image_inputs: false
17
- eagle_collator: true
18
- backbone_trainable_params_fp32: true
19
- image_crop_size: null
20
- image_target_size: null
21
- shortest_image_edge: 256
22
- crop_fraction: 0.95
23
- random_rotation_angle: null
24
- color_jitter_params: null
25
- use_albumentations_transforms: true
26
- extra_augmentation_config: null
27
- formalize_language: true
28
- apply_sincos_state_encoding: false
29
- use_relative_action: true
30
- max_state_dim: 29
31
- max_action_dim: 29
32
- action_horizon: 16
33
- hidden_size: 1024
34
- input_embedding_dim: 1536
35
- add_pos_embed: true
36
- attn_dropout: 0.2
37
- use_vlln: true
38
- max_seq_len: 1024
39
- use_alternate_vl_dit: true
40
- attend_text_every_n_blocks: 2
41
- diffusion_model_cfg:
42
- positional_embeddings: null
43
- num_layers: 32
44
- num_attention_heads: 32
45
- attention_head_dim: 48
46
- norm_type: ada_norm
47
- dropout: 0.2
48
- final_dropout: true
49
- output_dim: 1024
50
- interleave_self_attention: true
51
- num_inference_timesteps: 4
52
- noise_beta_alpha: 1.5
53
- noise_beta_beta: 1.0
54
- noise_s: 0.999
55
- num_timestep_buckets: 1000
56
- tune_projector: true
57
- tune_diffusion_model: true
58
- tune_vlln: true
59
- state_dropout_prob: 0.0
60
- state_additive_noise_scale: 0.0
61
- max_num_embodiments: 32
62
- data:
63
- datasets:
64
- - dataset_paths:
65
- - ./cherry_data
66
- embodiment_tag: new_embodiment
67
- mix_ratio: 1.0
68
- dataset_type: physical_embodiment
69
- val_dataset_path: null
70
- modality_configs:
71
- new_embodiment:
72
- video:
73
- delta_indices:
74
- - 0
75
- modality_keys:
76
- - cam_base
77
- - cam_wrist
78
- sin_cos_embedding_keys: null
79
- mean_std_embedding_keys: null
80
- action_configs: null
81
- state:
82
- delta_indices:
83
- - 0
84
- modality_keys:
85
- - arm
86
- - gripper
87
- sin_cos_embedding_keys: null
88
- mean_std_embedding_keys: null
89
- action_configs: null
90
- action:
91
- delta_indices:
92
- - 0
93
- - 1
94
- - 2
95
- - 3
96
- - 4
97
- - 5
98
- - 6
99
- - 7
100
- - 8
101
- - 9
102
- - 10
103
- - 11
104
- - 12
105
- - 13
106
- - 14
107
- - 15
108
- modality_keys:
109
- - arm
110
- - gripper
111
- sin_cos_embedding_keys: null
112
- mean_std_embedding_keys: null
113
- action_configs:
114
- - rep: ABSOLUTE
115
- type: NON_EEF
116
- format: DEFAULT
117
- state_key: null
118
- - rep: ABSOLUTE
119
- type: NON_EEF
120
- format: DEFAULT
121
- state_key: null
122
- language:
123
- delta_indices:
124
- - 0
125
- modality_keys:
126
- - annotation.human.task_description
127
- sin_cos_embedding_keys: null
128
- mean_std_embedding_keys: null
129
- action_configs: null
130
- download_cache: false
131
- shard_size: 1024
132
- episode_sampling_rate: 0.1
133
- num_shards_per_epoch: 100000
134
- override_pretraining_statistics: false
135
- mode: single_turn
136
- random_chop: 0.0
137
- mock_dataset_mode: false
138
- shuffle: true
139
- seed: 42
140
- multiprocessing_context: fork
141
- allow_padding: false
142
- subsample_ratio: 1.0
143
- image_crop_size:
144
- - 244
145
- - 244
146
- image_target_size:
147
- - 224
148
- - 224
149
- video_backend: decord
150
- training:
151
- output_dir: ./outputs/fr5_cherry
152
- experiment_name: null
153
- max_steps: 20000
154
- global_batch_size: 32
155
- batch_size: null
156
- gradient_accumulation_steps: 1
157
- learning_rate: 0.0001
158
- lr_scheduler_type: cosine
159
- weight_decay: 1.0e-05
160
- warmup_ratio: 0.05
161
- warmup_steps: 0
162
- max_grad_norm: 1.0
163
- optim: adamw_torch
164
- start_from_checkpoint: ./pretrained_models/GR00T-N1.6-3B
165
- tf32: true
166
- fp16: false
167
- bf16: true
168
- eval_bf16: true
169
- logging_steps: 10
170
- save_steps: 5000
171
- save_total_limit: 5
172
- save_vl_model: false
173
- upload_checkpoints: false
174
- upload_every: 1000
175
- upload_last_n_checkpoints: 5
176
- max_concurrent_uploads: 2
177
- eval_strategy: 'no'
178
- eval_steps: 500
179
- eval_set_split_ratio: 0.1
180
- eval_batch_size: 2
181
- save_best_eval_metric_name: ''
182
- save_best_eval_metric_greater_is_better: true
183
- deepspeed_stage: 2
184
- gradient_checkpointing: false
185
- transformers_trust_remote_code: true
186
- transformers_local_files_only: false
187
- transformers_cache_dir: null
188
- transformers_access_token: null
189
- use_ddp: false
190
- ddp_bucket_cap_mb: 100
191
- num_gpus: 1
192
- dataloader_num_workers: 2
193
- remove_unused_columns: false
194
- use_wandb: false
195
- wandb_project: finetune-gr00t-n1d6
196
- enable_profiling: false
197
- max_retries: 3
198
- assert_loss_less_than: null
199
- add_rl_callback: false
200
- enable_open_loop_eval: false
201
- open_loop_eval_traj_ids:
202
- - 0
203
- open_loop_eval_steps_per_traj: 100
204
- open_loop_eval_plot_indices: null
205
- max_steps: 20000
206
- save_steps: 5000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-10000/experiment_cfg/config.yaml DELETED
@@ -1,239 +0,0 @@
1
- !!python/object:gr00t.configs.base_config.Config
2
- data: !!python/object:gr00t.configs.data.data_config.DataConfig
3
- allow_padding: false
4
- datasets:
5
- - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
6
- dataset_paths:
7
- - ./cherry_data
8
- dataset_type: physical_embodiment
9
- embodiment_tag: new_embodiment
10
- mix_ratio: 1.0
11
- val_dataset_path: null
12
- download_cache: false
13
- episode_sampling_rate: 0.1
14
- image_crop_size:
15
- - 244
16
- - 244
17
- image_target_size:
18
- - 224
19
- - 224
20
- mock_dataset_mode: false
21
- modality_configs:
22
- new_embodiment:
23
- action: !!python/object:gr00t.data.types.ModalityConfig
24
- action_configs:
25
- - !!python/object:gr00t.data.types.ActionConfig
26
- format: &id001 !!python/object/apply:gr00t.data.types.ActionFormat
27
- - default
28
- rep: &id002 !!python/object/apply:gr00t.data.types.ActionRepresentation
29
- - absolute
30
- state_key: null
31
- type: &id003 !!python/object/apply:gr00t.data.types.ActionType
32
- - non_eef
33
- - !!python/object:gr00t.data.types.ActionConfig
34
- format: *id001
35
- rep: *id002
36
- state_key: null
37
- type: *id003
38
- delta_indices:
39
- - 0
40
- - 1
41
- - 2
42
- - 3
43
- - 4
44
- - 5
45
- - 6
46
- - 7
47
- - 8
48
- - 9
49
- - 10
50
- - 11
51
- - 12
52
- - 13
53
- - 14
54
- - 15
55
- mean_std_embedding_keys: null
56
- modality_keys:
57
- - arm
58
- - gripper
59
- sin_cos_embedding_keys: null
60
- language: !!python/object:gr00t.data.types.ModalityConfig
61
- action_configs: null
62
- delta_indices:
63
- - 0
64
- mean_std_embedding_keys: null
65
- modality_keys:
66
- - annotation.human.task_description
67
- sin_cos_embedding_keys: null
68
- state: !!python/object:gr00t.data.types.ModalityConfig
69
- action_configs: null
70
- delta_indices:
71
- - 0
72
- mean_std_embedding_keys: null
73
- modality_keys:
74
- - arm
75
- - gripper
76
- sin_cos_embedding_keys: null
77
- video: !!python/object:gr00t.data.types.ModalityConfig
78
- action_configs: null
79
- delta_indices:
80
- - 0
81
- mean_std_embedding_keys: null
82
- modality_keys:
83
- - cam_base
84
- - cam_wrist
85
- sin_cos_embedding_keys: null
86
- mode: single_turn
87
- multiprocessing_context: fork
88
- num_shards_per_epoch: 100000
89
- override_pretraining_statistics: false
90
- random_chop: 0.0
91
- seed: 42
92
- shard_size: 1024
93
- shuffle: true
94
- subsample_ratio: 1.0
95
- video_backend: decord
96
- load_config_path: null
97
- model: !!python/object:gr00t.configs.model.gr00t_n1d6.Gr00tN1d6Config
98
- _attn_implementation_autoset: false
99
- _attn_implementation_internal: null
100
- _commit_hash: null
101
- _name_or_path: ''
102
- add_cross_attention: false
103
- architectures: null
104
- backbone_model_type: eagle
105
- backbone_trainable_params_fp32: true
106
- bad_words_ids: null
107
- begin_suppress_tokens: null
108
- bos_token_id: null
109
- chunk_size_feed_forward: 0
110
- color_jitter_params: null
111
- cross_attention_hidden_size: null
112
- decoder_start_token_id: null
113
- diffusion_model_cfg:
114
- attention_head_dim: 48
115
- dropout: 0.2
116
- final_dropout: true
117
- interleave_self_attention: true
118
- norm_type: ada_norm
119
- num_attention_heads: 32
120
- num_layers: 32
121
- output_dim: 1024
122
- positional_embeddings: null
123
- diversity_penalty: 0.0
124
- do_sample: false
125
- eagle_collator: true
126
- early_stopping: false
127
- encoder_no_repeat_ngram_size: 0
128
- eos_token_id: null
129
- exponential_decay_length_penalty: null
130
- extra_augmentation_config: null
131
- finetuning_task: null
132
- forced_bos_token_id: null
133
- forced_eos_token_id: null
134
- id2label:
135
- 0: LABEL_0
136
- 1: LABEL_1
137
- is_decoder: false
138
- is_encoder_decoder: false
139
- label2id:
140
- LABEL_0: 0
141
- LABEL_1: 1
142
- length_penalty: 1.0
143
- load_bf16: false
144
- max_length: 20
145
- min_length: 0
146
- model_name: nvidia/Eagle-Block2A-2B-v2
147
- no_repeat_ngram_size: 0
148
- num_beam_groups: 1
149
- num_beams: 1
150
- num_return_sequences: 1
151
- output_attentions: false
152
- output_hidden_states: false
153
- output_scores: false
154
- pad_token_id: null
155
- prefix: null
156
- problem_type: null
157
- pruned_heads: {}
158
- random_rotation_angle: null
159
- remove_invalid_values: false
160
- repetition_penalty: 1.0
161
- reproject_vision: false
162
- return_dict: true
163
- return_dict_in_generate: false
164
- sep_token_id: null
165
- state_dropout_prob: 0.0
166
- suppress_tokens: null
167
- task_specific_params: null
168
- temperature: 1.0
169
- tf_legacy_loss: false
170
- tie_encoder_decoder: false
171
- tie_word_embeddings: true
172
- tokenizer_class: null
173
- top_k: 50
174
- top_p: 1.0
175
- torch_dtype: null
176
- torchscript: false
177
- transformers_version: null
178
- tune_diffusion_model: true
179
- tune_llm: false
180
- tune_projector: true
181
- tune_visual: false
182
- typical_p: 1.0
183
- use_bfloat16: false
184
- use_relative_action: true
185
- training: !!python/object:gr00t.configs.training.training_config.TrainingConfig
186
- add_rl_callback: false
187
- assert_loss_less_than: null
188
- batch_size: null
189
- bf16: true
190
- dataloader_num_workers: 2
191
- ddp_bucket_cap_mb: 100
192
- deepspeed_stage: 2
193
- enable_open_loop_eval: false
194
- enable_profiling: false
195
- eval_batch_size: 2
196
- eval_bf16: true
197
- eval_set_split_ratio: 0.1
198
- eval_steps: 500
199
- eval_strategy: 'no'
200
- experiment_name: null
201
- fp16: false
202
- global_batch_size: 32
203
- gradient_accumulation_steps: 1
204
- gradient_checkpointing: false
205
- learning_rate: 0.0001
206
- logging_steps: 10
207
- lr_scheduler_type: cosine
208
- max_concurrent_uploads: 2
209
- max_grad_norm: 1.0
210
- max_retries: 3
211
- max_steps: 20000
212
- num_gpus: 1
213
- open_loop_eval_plot_indices: null
214
- open_loop_eval_steps_per_traj: 100
215
- open_loop_eval_traj_ids:
216
- - 0
217
- optim: adamw_torch
218
- output_dir: ./outputs/fr5_cherry
219
- remove_unused_columns: false
220
- save_best_eval_metric_greater_is_better: true
221
- save_best_eval_metric_name: ''
222
- save_steps: 5000
223
- save_total_limit: 5
224
- save_vl_model: false
225
- start_from_checkpoint: ./pretrained_models/GR00T-N1.6-3B
226
- tf32: true
227
- transformers_access_token: null
228
- transformers_cache_dir: null
229
- transformers_local_files_only: false
230
- transformers_trust_remote_code: true
231
- upload_checkpoints: false
232
- upload_every: 1000
233
- upload_last_n_checkpoints: 5
234
- use_ddp: false
235
- use_wandb: false
236
- wandb_project: finetune-gr00t-n1d6
237
- warmup_ratio: 0.05
238
- warmup_steps: 0
239
- weight_decay: 1.0e-05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-10000/experiment_cfg/dataset_statistics.json DELETED
@@ -1,149 +0,0 @@
1
- {
2
- "new_embodiment": {
3
- "state": {
4
- "arm": {
5
- "min": [
6
- 0.4818978011608124,
7
- -1.687173843383789,
8
- 0.62826007604599,
9
- -2.6761701107025146,
10
- -1.8431425094604492,
11
- -0.5678880214691162
12
- ],
13
- "max": [
14
- 0.7935351133346558,
15
- -1.014952301979065,
16
- 1.8637524843215942,
17
- -1.0820374488830566,
18
- -1.4455490112304688,
19
- 0.3115537762641907
20
- ],
21
- "mean": [
22
- 0.6489784717559814,
23
- -1.3269319534301758,
24
- 1.356391429901123,
25
- -1.804563045501709,
26
- -1.619696021080017,
27
- -0.07974076271057129
28
- ],
29
- "std": [
30
- 0.053538445383310186,
31
- 0.1604488044977188,
32
- 0.2438623011112213,
33
- 0.22075510025024414,
34
- 0.07333532720804177,
35
- 0.10092151165008545
36
- ],
37
- "q01": [
38
- 0.5197953635454178,
39
- -1.6432996988296509,
40
- 0.8626433879137039,
41
- -2.5542680168151857,
42
- -1.8000394713878631,
43
- -0.37301090329885483
44
- ],
45
- "q99": [
46
- 0.7509180748462676,
47
- -1.0879072868824005,
48
- 1.7959050333499906,
49
- -1.252977850437165,
50
- -1.4705305182933812,
51
- 0.2933953133225437
52
- ]
53
- },
54
- "gripper": {
55
- "min": [
56
- 0.0
57
- ],
58
- "max": [
59
- 1.0
60
- ],
61
- "mean": [
62
- 0.7650123238563538
63
- ],
64
- "std": [
65
- 0.39907386898994446
66
- ],
67
- "q01": [
68
- 0.0
69
- ],
70
- "q99": [
71
- 1.0
72
- ]
73
- }
74
- },
75
- "action": {
76
- "arm": {
77
- "min": [
78
- 0.4818978011608124,
79
- -1.687173843383789,
80
- 0.62826007604599,
81
- -2.6573522090911865,
82
- -1.8431425094604492,
83
- -0.5678880214691162
84
- ],
85
- "max": [
86
- 0.7935351133346558,
87
- -1.014952301979065,
88
- 1.8637524843215942,
89
- -1.0820374488830566,
90
- -1.4455490112304688,
91
- 0.3115537762641907
92
- ],
93
- "mean": [
94
- 0.6489997506141663,
95
- -1.326717495918274,
96
- 1.355955958366394,
97
- -1.8026670217514038,
98
- -1.6199865341186523,
99
- -0.07982920855283737
100
- ],
101
- "std": [
102
- 0.05358240380883204,
103
- 0.16021256148815155,
104
- 0.243374302983284,
105
- 0.2178075611591339,
106
- 0.07321629673242531,
107
- 0.10097639262676239
108
- ],
109
- "q01": [
110
- 0.5197953635454178,
111
- -1.6432996988296509,
112
- 0.8625765931606293,
113
- -2.53433034658432,
114
- -1.8000823378562927,
115
- -0.37301090329885483
116
- ],
117
- "q99": [
118
- 0.7509180748462676,
119
- -1.0879072868824005,
120
- 1.7849992513656616,
121
- -1.2526323044300085,
122
- -1.4705633461475374,
123
- 0.2933953133225437
124
- ]
125
- },
126
- "gripper": {
127
- "min": [
128
- 0.0
129
- ],
130
- "max": [
131
- 1.0
132
- ],
133
- "mean": [
134
- 0.7650123238563538
135
- ],
136
- "std": [
137
- 0.39907386898994446
138
- ],
139
- "q01": [
140
- 0.0
141
- ],
142
- "q99": [
143
- 1.0
144
- ]
145
- }
146
- },
147
- "relative_action": {}
148
- }
149
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-10000/experiment_cfg/final_model_config.json DELETED
@@ -1,54 +0,0 @@
1
- {
2
- "model_type": "Gr00tN1d6",
3
- "model_dtype": "bfloat16",
4
- "model_name": "nvidia/Eagle-Block2A-2B-v2",
5
- "backbone_model_type": "eagle",
6
- "model_revision": null,
7
- "tune_top_llm_layers": 4,
8
- "backbone_embedding_dim": 2048,
9
- "tune_llm": false,
10
- "tune_visual": false,
11
- "select_layer": 16,
12
- "reproject_vision": false,
13
- "use_flash_attention": true,
14
- "load_bf16": true,
15
- "collator_overwrite_image_inputs": false,
16
- "eagle_collator": true,
17
- "backbone_trainable_params_fp32": true,
18
- "extra_augmentation_config": null,
19
- "apply_sincos_state_encoding": true,
20
- "use_relative_action": true,
21
- "max_state_dim": 128,
22
- "max_action_dim": 128,
23
- "action_horizon": 50,
24
- "hidden_size": 1024,
25
- "input_embedding_dim": 1536,
26
- "add_pos_embed": true,
27
- "attn_dropout": 0.2,
28
- "use_vlln": true,
29
- "max_seq_len": 1024,
30
- "use_alternate_vl_dit": true,
31
- "attend_text_every_n_blocks": 2,
32
- "diffusion_model_cfg": {
33
- "attention_head_dim": 48,
34
- "dropout": 0.2,
35
- "final_dropout": true,
36
- "interleave_self_attention": true,
37
- "norm_type": "ada_norm",
38
- "num_attention_heads": 32,
39
- "num_layers": 32,
40
- "output_dim": 1024,
41
- "positional_embeddings": null
42
- },
43
- "num_inference_timesteps": 4,
44
- "noise_beta_alpha": 1.5,
45
- "noise_beta_beta": 1.0,
46
- "noise_s": 0.999,
47
- "num_timestep_buckets": 1000,
48
- "tune_projector": true,
49
- "tune_diffusion_model": true,
50
- "tune_vlln": true,
51
- "state_dropout_prob": 0.0,
52
- "state_additive_noise_scale": 0.0,
53
- "max_num_embodiments": 32
54
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-10000/experiment_cfg/final_processor_config.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoint-10000/model-00001-of-00002.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a22551925e5bb41c48ebb3cd8533566607a4e966f51d34097f53fbe937a91659
3
- size 4990120184
 
 
 
 
checkpoint-10000/model-00002-of-00002.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:01a8b9f9243a6606b220e9ecfdf9c90271caecbf6a218d878817c602c0be7a3e
3
- size 4823190320
 
 
 
 
checkpoint-10000/model.safetensors.index.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoint-10000/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1daa28f457b70c4bbcb6ef25875f17b75f9c587aee1da4f54bc2f8a7177d9af
3
- size 12960193762
 
 
 
 
checkpoint-10000/processor_config.json DELETED
@@ -1,455 +0,0 @@
1
- {
2
- "processor_class": "Gr00tN1d6Processor",
3
- "processor_kwargs": {
4
- "modality_configs": {
5
- "behavior_r1_pro": {
6
- "video": {
7
- "delta_indices": [
8
- 0
9
- ],
10
- "modality_keys": [
11
- "observation.images.rgb.head_256_256",
12
- "observation.images.rgb.left_wrist_256_256",
13
- "observation.images.rgb.right_wrist_256_256"
14
- ],
15
- "sin_cos_embedding_keys": null,
16
- "mean_std_embedding_keys": null,
17
- "action_configs": null
18
- },
19
- "state": {
20
- "delta_indices": [
21
- 0
22
- ],
23
- "modality_keys": [
24
- "robot_pos",
25
- "robot_ori_cos",
26
- "robot_ori_sin",
27
- "robot_2d_ori",
28
- "robot_2d_ori_cos",
29
- "robot_2d_ori_sin",
30
- "robot_lin_vel",
31
- "robot_ang_vel",
32
- "arm_left_qpos",
33
- "arm_left_qpos_sin",
34
- "arm_left_qpos_cos",
35
- "eef_left_pos",
36
- "eef_left_quat",
37
- "gripper_left_qpos",
38
- "arm_right_qpos",
39
- "arm_right_qpos_sin",
40
- "arm_right_qpos_cos",
41
- "eef_right_pos",
42
- "eef_right_quat",
43
- "gripper_right_qpos",
44
- "trunk_qpos"
45
- ],
46
- "sin_cos_embedding_keys": null,
47
- "mean_std_embedding_keys": null,
48
- "action_configs": null
49
- },
50
- "action": {
51
- "delta_indices": [
52
- 0,
53
- 1,
54
- 2,
55
- 3,
56
- 4,
57
- 5,
58
- 6,
59
- 7,
60
- 8,
61
- 9,
62
- 10,
63
- 11,
64
- 12,
65
- 13,
66
- 14,
67
- 15,
68
- 16,
69
- 17,
70
- 18,
71
- 19,
72
- 20,
73
- 21,
74
- 22,
75
- 23,
76
- 24,
77
- 25,
78
- 26,
79
- 27,
80
- 28,
81
- 29,
82
- 30,
83
- 31
84
- ],
85
- "modality_keys": [
86
- "base",
87
- "torso",
88
- "left_arm",
89
- "left_gripper",
90
- "right_arm",
91
- "right_gripper"
92
- ],
93
- "sin_cos_embedding_keys": null,
94
- "mean_std_embedding_keys": null,
95
- "action_configs": [
96
- {
97
- "rep": "ABSOLUTE",
98
- "type": "NON_EEF",
99
- "format": "DEFAULT",
100
- "state_key": null
101
- },
102
- {
103
- "rep": "RELATIVE",
104
- "type": "NON_EEF",
105
- "format": "DEFAULT",
106
- "state_key": "trunk_qpos"
107
- },
108
- {
109
- "rep": "RELATIVE",
110
- "type": "NON_EEF",
111
- "format": "DEFAULT",
112
- "state_key": "arm_left_qpos"
113
- },
114
- {
115
- "rep": "ABSOLUTE",
116
- "type": "NON_EEF",
117
- "format": "DEFAULT",
118
- "state_key": null
119
- },
120
- {
121
- "rep": "RELATIVE",
122
- "type": "NON_EEF",
123
- "format": "DEFAULT",
124
- "state_key": "arm_right_qpos"
125
- },
126
- {
127
- "rep": "ABSOLUTE",
128
- "type": "NON_EEF",
129
- "format": "DEFAULT",
130
- "state_key": null
131
- }
132
- ]
133
- },
134
- "language": {
135
- "delta_indices": [
136
- 0
137
- ],
138
- "modality_keys": [
139
- "annotation.human.coarse_action"
140
- ],
141
- "sin_cos_embedding_keys": null,
142
- "mean_std_embedding_keys": null,
143
- "action_configs": null
144
- }
145
- },
146
- "gr1": {
147
- "video": {
148
- "delta_indices": [
149
- 0
150
- ],
151
- "modality_keys": [
152
- "ego_view_bg_crop_pad_res256_freq20"
153
- ],
154
- "sin_cos_embedding_keys": null,
155
- "mean_std_embedding_keys": null,
156
- "action_configs": null
157
- },
158
- "state": {
159
- "delta_indices": [
160
- 0
161
- ],
162
- "modality_keys": [
163
- "left_arm",
164
- "right_arm",
165
- "left_hand",
166
- "right_hand",
167
- "waist"
168
- ],
169
- "sin_cos_embedding_keys": [
170
- "left_arm",
171
- "right_arm",
172
- "left_hand",
173
- "right_hand",
174
- "waist"
175
- ],
176
- "mean_std_embedding_keys": null,
177
- "action_configs": null
178
- },
179
- "action": {
180
- "delta_indices": [
181
- 0,
182
- 1,
183
- 2,
184
- 3,
185
- 4,
186
- 5,
187
- 6,
188
- 7,
189
- 8,
190
- 9,
191
- 10,
192
- 11,
193
- 12,
194
- 13,
195
- 14,
196
- 15
197
- ],
198
- "modality_keys": [
199
- "left_arm",
200
- "right_arm",
201
- "left_hand",
202
- "right_hand",
203
- "waist"
204
- ],
205
- "sin_cos_embedding_keys": null,
206
- "mean_std_embedding_keys": null,
207
- "action_configs": [
208
- {
209
- "rep": "RELATIVE",
210
- "type": "NON_EEF",
211
- "format": "DEFAULT",
212
- "state_key": null
213
- },
214
- {
215
- "rep": "RELATIVE",
216
- "type": "NON_EEF",
217
- "format": "DEFAULT",
218
- "state_key": null
219
- },
220
- {
221
- "rep": "RELATIVE",
222
- "type": "NON_EEF",
223
- "format": "DEFAULT",
224
- "state_key": null
225
- },
226
- {
227
- "rep": "RELATIVE",
228
- "type": "NON_EEF",
229
- "format": "DEFAULT",
230
- "state_key": null
231
- },
232
- {
233
- "rep": "ABSOLUTE",
234
- "type": "NON_EEF",
235
- "format": "DEFAULT",
236
- "state_key": null
237
- }
238
- ]
239
- },
240
- "language": {
241
- "delta_indices": [
242
- 0
243
- ],
244
- "modality_keys": [
245
- "task"
246
- ],
247
- "sin_cos_embedding_keys": null,
248
- "mean_std_embedding_keys": null,
249
- "action_configs": null
250
- }
251
- },
252
- "robocasa_panda_omron": {
253
- "video": {
254
- "delta_indices": [
255
- 0
256
- ],
257
- "modality_keys": [
258
- "res256_image_side_0",
259
- "res256_image_side_1",
260
- "res256_image_wrist_0"
261
- ],
262
- "sin_cos_embedding_keys": null,
263
- "mean_std_embedding_keys": null,
264
- "action_configs": null
265
- },
266
- "state": {
267
- "delta_indices": [
268
- 0
269
- ],
270
- "modality_keys": [
271
- "end_effector_position_relative",
272
- "end_effector_rotation_relative",
273
- "gripper_qpos",
274
- "base_position",
275
- "base_rotation"
276
- ],
277
- "sin_cos_embedding_keys": null,
278
- "mean_std_embedding_keys": null,
279
- "action_configs": null
280
- },
281
- "action": {
282
- "delta_indices": [
283
- 0,
284
- 1,
285
- 2,
286
- 3,
287
- 4,
288
- 5,
289
- 6,
290
- 7,
291
- 8,
292
- 9,
293
- 10,
294
- 11,
295
- 12,
296
- 13,
297
- 14,
298
- 15
299
- ],
300
- "modality_keys": [
301
- "end_effector_position",
302
- "end_effector_rotation",
303
- "gripper_close",
304
- "base_motion",
305
- "control_mode"
306
- ],
307
- "sin_cos_embedding_keys": null,
308
- "mean_std_embedding_keys": null,
309
- "action_configs": [
310
- {
311
- "rep": "ABSOLUTE",
312
- "type": "NON_EEF",
313
- "format": "DEFAULT",
314
- "state_key": null
315
- },
316
- {
317
- "rep": "ABSOLUTE",
318
- "type": "NON_EEF",
319
- "format": "DEFAULT",
320
- "state_key": null
321
- },
322
- {
323
- "rep": "ABSOLUTE",
324
- "type": "NON_EEF",
325
- "format": "DEFAULT",
326
- "state_key": null
327
- },
328
- {
329
- "rep": "ABSOLUTE",
330
- "type": "NON_EEF",
331
- "format": "DEFAULT",
332
- "state_key": null
333
- },
334
- {
335
- "rep": "ABSOLUTE",
336
- "type": "NON_EEF",
337
- "format": "DEFAULT",
338
- "state_key": null
339
- }
340
- ]
341
- },
342
- "language": {
343
- "delta_indices": [
344
- 0
345
- ],
346
- "modality_keys": [
347
- "annotation.human.action.task_description"
348
- ],
349
- "sin_cos_embedding_keys": null,
350
- "mean_std_embedding_keys": null,
351
- "action_configs": null
352
- }
353
- },
354
- "new_embodiment": {
355
- "video": {
356
- "delta_indices": [
357
- 0
358
- ],
359
- "modality_keys": [
360
- "cam_base",
361
- "cam_wrist"
362
- ],
363
- "sin_cos_embedding_keys": null,
364
- "mean_std_embedding_keys": null,
365
- "action_configs": null
366
- },
367
- "state": {
368
- "delta_indices": [
369
- 0
370
- ],
371
- "modality_keys": [
372
- "arm",
373
- "gripper"
374
- ],
375
- "sin_cos_embedding_keys": null,
376
- "mean_std_embedding_keys": null,
377
- "action_configs": null
378
- },
379
- "action": {
380
- "delta_indices": [
381
- 0,
382
- 1,
383
- 2,
384
- 3,
385
- 4,
386
- 5,
387
- 6,
388
- 7,
389
- 8,
390
- 9,
391
- 10,
392
- 11,
393
- 12,
394
- 13,
395
- 14,
396
- 15
397
- ],
398
- "modality_keys": [
399
- "arm",
400
- "gripper"
401
- ],
402
- "sin_cos_embedding_keys": null,
403
- "mean_std_embedding_keys": null,
404
- "action_configs": [
405
- {
406
- "rep": "ABSOLUTE",
407
- "type": "NON_EEF",
408
- "format": "DEFAULT",
409
- "state_key": null
410
- },
411
- {
412
- "rep": "ABSOLUTE",
413
- "type": "NON_EEF",
414
- "format": "DEFAULT",
415
- "state_key": null
416
- }
417
- ]
418
- },
419
- "language": {
420
- "delta_indices": [
421
- 0
422
- ],
423
- "modality_keys": [
424
- "annotation.human.task_description"
425
- ],
426
- "sin_cos_embedding_keys": null,
427
- "mean_std_embedding_keys": null,
428
- "action_configs": null
429
- }
430
- }
431
- },
432
- "image_crop_size": null,
433
- "image_target_size": null,
434
- "use_albumentations": true,
435
- "random_rotation_angle": null,
436
- "color_jitter_params": {
437
- "brightness": 0.3,
438
- "contrast": 0.4,
439
- "saturation": 0.5,
440
- "hue": 0.08
441
- },
442
- "shortest_image_edge": 256,
443
- "crop_fraction": 0.95,
444
- "model_name": "nvidia/Eagle-Block2A-2B-v2",
445
- "model_type": "eagle",
446
- "formalize_language": true,
447
- "max_state_dim": 128,
448
- "max_action_dim": 128,
449
- "max_action_horizon": 50,
450
- "use_percentiles": false,
451
- "clip_outliers": true,
452
- "apply_sincos_state_encoding": true,
453
- "use_relative_action": true
454
- }
455
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-10000/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3f5ea38e8fe73fc84868ec3e6011e3571a59e9595fe7e6de70a21b520b40798f
3
- size 14645
 
 
 
 
checkpoint-10000/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9793ce798c508501a83c7de96591494637b8ee20bbad166324a3caeddb4cdc8e
3
- size 1465
 
 
 
 
checkpoint-10000/statistics.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoint-10000/trainer_state.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoint-10000/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:49cf1bb9160fba423ccdd2dae7d9b40228772dd15a3fb17d44a4fd07f7818f45
3
- size 5713
 
 
 
 
checkpoint-10000/wandb_config.json DELETED
@@ -1 +0,0 @@
1
- {"project": "finetune-gr00t-n1d6", "run_id": "fr5_cherry"}
 
 
checkpoint-15000/config.json DELETED
@@ -1,70 +0,0 @@
1
- {
2
- "action_horizon": 50,
3
- "add_pos_embed": true,
4
- "apply_sincos_state_encoding": true,
5
- "architectures": [
6
- "Gr00tN1d6"
7
- ],
8
- "attn_dropout": 0.2,
9
- "attn_implementation": null,
10
- "backbone_embedding_dim": 2048,
11
- "backbone_model_type": "eagle",
12
- "backbone_trainable_params_fp32": true,
13
- "collator_overwrite_image_inputs": false,
14
- "color_jitter_params": {
15
- "brightness": 0.1,
16
- "contrast": 0.1,
17
- "hue": 0.1,
18
- "saturation": 0.1
19
- },
20
- "crop_fraction": 0.95,
21
- "diffusion_model_cfg": {
22
- "attention_head_dim": 48,
23
- "dropout": 0.2,
24
- "final_dropout": true,
25
- "interleave_self_attention": true,
26
- "norm_type": "ada_norm",
27
- "num_attention_heads": 32,
28
- "num_layers": 32,
29
- "output_dim": 1024,
30
- "positional_embeddings": null
31
- },
32
- "eagle_collator": true,
33
- "formalize_language": true,
34
- "gemma_collator": false,
35
- "hidden_size": 1024,
36
- "image_crop_size": null,
37
- "image_target_size": null,
38
- "input_embedding_dim": 1536,
39
- "load_bf16": true,
40
- "max_action_dim": 128,
41
- "max_num_embodiments": 32,
42
- "max_seq_len": 1024,
43
- "max_state_dim": 128,
44
- "model_dtype": "bfloat16",
45
- "model_name": "nvidia/Eagle-Block2A-2B-v2",
46
- "model_type": "Gr00tN1d6",
47
- "noise_beta_alpha": 1.5,
48
- "noise_beta_beta": 1.0,
49
- "noise_s": 0.999,
50
- "num_inference_timesteps": 4,
51
- "num_timestep_buckets": 1000,
52
- "random_rotation_angle": null,
53
- "reproject_vision": false,
54
- "select_layer": 16,
55
- "shortest_image_edge": 256,
56
- "state_dropout_prob": 0.0,
57
- "torch_dtype": "bfloat16",
58
- "transformers_version": "4.51.3",
59
- "tune_diffusion_model": true,
60
- "tune_llm": false,
61
- "tune_projector": true,
62
- "tune_top_llm_layers": 4,
63
- "tune_visual": false,
64
- "tune_vlln": true,
65
- "use_albumentations_transforms": true,
66
- "use_alternate_vl_dit": true,
67
- "use_flash_attention": true,
68
- "use_relative_action": true,
69
- "use_vlln": true
70
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-15000/embodiment_id.json DELETED
@@ -1,11 +0,0 @@
1
- {
2
- "robocasa_panda_omron": 13,
3
- "gr1": 20,
4
- "behavior_r1_pro": 24,
5
- "unitree_g1": 8,
6
- "oxe_google": 0,
7
- "oxe_widowx": 1,
8
- "libero_panda": 2,
9
- "oxe_droid": 16,
10
- "new_embodiment": 10
11
- }
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-15000/experiment_cfg/conf.yaml DELETED
@@ -1,206 +0,0 @@
1
- load_config_path: null
2
- model:
3
- model_type: Gr00tN1d6
4
- model_dtype: bfloat16
5
- model_name: nvidia/Eagle-Block2A-2B-v2
6
- backbone_model_type: eagle
7
- model_revision: null
8
- tune_top_llm_layers: 4
9
- backbone_embedding_dim: 2048
10
- tune_llm: false
11
- tune_visual: false
12
- select_layer: 16
13
- reproject_vision: false
14
- use_flash_attention: true
15
- load_bf16: false
16
- collator_overwrite_image_inputs: false
17
- eagle_collator: true
18
- backbone_trainable_params_fp32: true
19
- image_crop_size: null
20
- image_target_size: null
21
- shortest_image_edge: 256
22
- crop_fraction: 0.95
23
- random_rotation_angle: null
24
- color_jitter_params: null
25
- use_albumentations_transforms: true
26
- extra_augmentation_config: null
27
- formalize_language: true
28
- apply_sincos_state_encoding: false
29
- use_relative_action: true
30
- max_state_dim: 29
31
- max_action_dim: 29
32
- action_horizon: 16
33
- hidden_size: 1024
34
- input_embedding_dim: 1536
35
- add_pos_embed: true
36
- attn_dropout: 0.2
37
- use_vlln: true
38
- max_seq_len: 1024
39
- use_alternate_vl_dit: true
40
- attend_text_every_n_blocks: 2
41
- diffusion_model_cfg:
42
- positional_embeddings: null
43
- num_layers: 32
44
- num_attention_heads: 32
45
- attention_head_dim: 48
46
- norm_type: ada_norm
47
- dropout: 0.2
48
- final_dropout: true
49
- output_dim: 1024
50
- interleave_self_attention: true
51
- num_inference_timesteps: 4
52
- noise_beta_alpha: 1.5
53
- noise_beta_beta: 1.0
54
- noise_s: 0.999
55
- num_timestep_buckets: 1000
56
- tune_projector: true
57
- tune_diffusion_model: true
58
- tune_vlln: true
59
- state_dropout_prob: 0.0
60
- state_additive_noise_scale: 0.0
61
- max_num_embodiments: 32
62
- data:
63
- datasets:
64
- - dataset_paths:
65
- - ./cherry_data
66
- embodiment_tag: new_embodiment
67
- mix_ratio: 1.0
68
- dataset_type: physical_embodiment
69
- val_dataset_path: null
70
- modality_configs:
71
- new_embodiment:
72
- video:
73
- delta_indices:
74
- - 0
75
- modality_keys:
76
- - cam_base
77
- - cam_wrist
78
- sin_cos_embedding_keys: null
79
- mean_std_embedding_keys: null
80
- action_configs: null
81
- state:
82
- delta_indices:
83
- - 0
84
- modality_keys:
85
- - arm
86
- - gripper
87
- sin_cos_embedding_keys: null
88
- mean_std_embedding_keys: null
89
- action_configs: null
90
- action:
91
- delta_indices:
92
- - 0
93
- - 1
94
- - 2
95
- - 3
96
- - 4
97
- - 5
98
- - 6
99
- - 7
100
- - 8
101
- - 9
102
- - 10
103
- - 11
104
- - 12
105
- - 13
106
- - 14
107
- - 15
108
- modality_keys:
109
- - arm
110
- - gripper
111
- sin_cos_embedding_keys: null
112
- mean_std_embedding_keys: null
113
- action_configs:
114
- - rep: ABSOLUTE
115
- type: NON_EEF
116
- format: DEFAULT
117
- state_key: null
118
- - rep: ABSOLUTE
119
- type: NON_EEF
120
- format: DEFAULT
121
- state_key: null
122
- language:
123
- delta_indices:
124
- - 0
125
- modality_keys:
126
- - annotation.human.task_description
127
- sin_cos_embedding_keys: null
128
- mean_std_embedding_keys: null
129
- action_configs: null
130
- download_cache: false
131
- shard_size: 1024
132
- episode_sampling_rate: 0.1
133
- num_shards_per_epoch: 100000
134
- override_pretraining_statistics: false
135
- mode: single_turn
136
- random_chop: 0.0
137
- mock_dataset_mode: false
138
- shuffle: true
139
- seed: 42
140
- multiprocessing_context: fork
141
- allow_padding: false
142
- subsample_ratio: 1.0
143
- image_crop_size:
144
- - 244
145
- - 244
146
- image_target_size:
147
- - 224
148
- - 224
149
- video_backend: decord
150
- training:
151
- output_dir: ./outputs/fr5_cherry
152
- experiment_name: null
153
- max_steps: 20000
154
- global_batch_size: 32
155
- batch_size: null
156
- gradient_accumulation_steps: 1
157
- learning_rate: 0.0001
158
- lr_scheduler_type: cosine
159
- weight_decay: 1.0e-05
160
- warmup_ratio: 0.05
161
- warmup_steps: 0
162
- max_grad_norm: 1.0
163
- optim: adamw_torch
164
- start_from_checkpoint: ./pretrained_models/GR00T-N1.6-3B
165
- tf32: true
166
- fp16: false
167
- bf16: true
168
- eval_bf16: true
169
- logging_steps: 10
170
- save_steps: 5000
171
- save_total_limit: 5
172
- save_vl_model: false
173
- upload_checkpoints: false
174
- upload_every: 1000
175
- upload_last_n_checkpoints: 5
176
- max_concurrent_uploads: 2
177
- eval_strategy: 'no'
178
- eval_steps: 500
179
- eval_set_split_ratio: 0.1
180
- eval_batch_size: 2
181
- save_best_eval_metric_name: ''
182
- save_best_eval_metric_greater_is_better: true
183
- deepspeed_stage: 2
184
- gradient_checkpointing: false
185
- transformers_trust_remote_code: true
186
- transformers_local_files_only: false
187
- transformers_cache_dir: null
188
- transformers_access_token: null
189
- use_ddp: false
190
- ddp_bucket_cap_mb: 100
191
- num_gpus: 1
192
- dataloader_num_workers: 2
193
- remove_unused_columns: false
194
- use_wandb: false
195
- wandb_project: finetune-gr00t-n1d6
196
- enable_profiling: false
197
- max_retries: 3
198
- assert_loss_less_than: null
199
- add_rl_callback: false
200
- enable_open_loop_eval: false
201
- open_loop_eval_traj_ids:
202
- - 0
203
- open_loop_eval_steps_per_traj: 100
204
- open_loop_eval_plot_indices: null
205
- max_steps: 20000
206
- save_steps: 5000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-15000/experiment_cfg/config.yaml DELETED
@@ -1,239 +0,0 @@
1
- !!python/object:gr00t.configs.base_config.Config
2
- data: !!python/object:gr00t.configs.data.data_config.DataConfig
3
- allow_padding: false
4
- datasets:
5
- - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
6
- dataset_paths:
7
- - ./cherry_data
8
- dataset_type: physical_embodiment
9
- embodiment_tag: new_embodiment
10
- mix_ratio: 1.0
11
- val_dataset_path: null
12
- download_cache: false
13
- episode_sampling_rate: 0.1
14
- image_crop_size:
15
- - 244
16
- - 244
17
- image_target_size:
18
- - 224
19
- - 224
20
- mock_dataset_mode: false
21
- modality_configs:
22
- new_embodiment:
23
- action: !!python/object:gr00t.data.types.ModalityConfig
24
- action_configs:
25
- - !!python/object:gr00t.data.types.ActionConfig
26
- format: &id001 !!python/object/apply:gr00t.data.types.ActionFormat
27
- - default
28
- rep: &id002 !!python/object/apply:gr00t.data.types.ActionRepresentation
29
- - absolute
30
- state_key: null
31
- type: &id003 !!python/object/apply:gr00t.data.types.ActionType
32
- - non_eef
33
- - !!python/object:gr00t.data.types.ActionConfig
34
- format: *id001
35
- rep: *id002
36
- state_key: null
37
- type: *id003
38
- delta_indices:
39
- - 0
40
- - 1
41
- - 2
42
- - 3
43
- - 4
44
- - 5
45
- - 6
46
- - 7
47
- - 8
48
- - 9
49
- - 10
50
- - 11
51
- - 12
52
- - 13
53
- - 14
54
- - 15
55
- mean_std_embedding_keys: null
56
- modality_keys:
57
- - arm
58
- - gripper
59
- sin_cos_embedding_keys: null
60
- language: !!python/object:gr00t.data.types.ModalityConfig
61
- action_configs: null
62
- delta_indices:
63
- - 0
64
- mean_std_embedding_keys: null
65
- modality_keys:
66
- - annotation.human.task_description
67
- sin_cos_embedding_keys: null
68
- state: !!python/object:gr00t.data.types.ModalityConfig
69
- action_configs: null
70
- delta_indices:
71
- - 0
72
- mean_std_embedding_keys: null
73
- modality_keys:
74
- - arm
75
- - gripper
76
- sin_cos_embedding_keys: null
77
- video: !!python/object:gr00t.data.types.ModalityConfig
78
- action_configs: null
79
- delta_indices:
80
- - 0
81
- mean_std_embedding_keys: null
82
- modality_keys:
83
- - cam_base
84
- - cam_wrist
85
- sin_cos_embedding_keys: null
86
- mode: single_turn
87
- multiprocessing_context: fork
88
- num_shards_per_epoch: 100000
89
- override_pretraining_statistics: false
90
- random_chop: 0.0
91
- seed: 42
92
- shard_size: 1024
93
- shuffle: true
94
- subsample_ratio: 1.0
95
- video_backend: decord
96
- load_config_path: null
97
- model: !!python/object:gr00t.configs.model.gr00t_n1d6.Gr00tN1d6Config
98
- _attn_implementation_autoset: false
99
- _attn_implementation_internal: null
100
- _commit_hash: null
101
- _name_or_path: ''
102
- add_cross_attention: false
103
- architectures: null
104
- backbone_model_type: eagle
105
- backbone_trainable_params_fp32: true
106
- bad_words_ids: null
107
- begin_suppress_tokens: null
108
- bos_token_id: null
109
- chunk_size_feed_forward: 0
110
- color_jitter_params: null
111
- cross_attention_hidden_size: null
112
- decoder_start_token_id: null
113
- diffusion_model_cfg:
114
- attention_head_dim: 48
115
- dropout: 0.2
116
- final_dropout: true
117
- interleave_self_attention: true
118
- norm_type: ada_norm
119
- num_attention_heads: 32
120
- num_layers: 32
121
- output_dim: 1024
122
- positional_embeddings: null
123
- diversity_penalty: 0.0
124
- do_sample: false
125
- eagle_collator: true
126
- early_stopping: false
127
- encoder_no_repeat_ngram_size: 0
128
- eos_token_id: null
129
- exponential_decay_length_penalty: null
130
- extra_augmentation_config: null
131
- finetuning_task: null
132
- forced_bos_token_id: null
133
- forced_eos_token_id: null
134
- id2label:
135
- 0: LABEL_0
136
- 1: LABEL_1
137
- is_decoder: false
138
- is_encoder_decoder: false
139
- label2id:
140
- LABEL_0: 0
141
- LABEL_1: 1
142
- length_penalty: 1.0
143
- load_bf16: false
144
- max_length: 20
145
- min_length: 0
146
- model_name: nvidia/Eagle-Block2A-2B-v2
147
- no_repeat_ngram_size: 0
148
- num_beam_groups: 1
149
- num_beams: 1
150
- num_return_sequences: 1
151
- output_attentions: false
152
- output_hidden_states: false
153
- output_scores: false
154
- pad_token_id: null
155
- prefix: null
156
- problem_type: null
157
- pruned_heads: {}
158
- random_rotation_angle: null
159
- remove_invalid_values: false
160
- repetition_penalty: 1.0
161
- reproject_vision: false
162
- return_dict: true
163
- return_dict_in_generate: false
164
- sep_token_id: null
165
- state_dropout_prob: 0.0
166
- suppress_tokens: null
167
- task_specific_params: null
168
- temperature: 1.0
169
- tf_legacy_loss: false
170
- tie_encoder_decoder: false
171
- tie_word_embeddings: true
172
- tokenizer_class: null
173
- top_k: 50
174
- top_p: 1.0
175
- torch_dtype: null
176
- torchscript: false
177
- transformers_version: null
178
- tune_diffusion_model: true
179
- tune_llm: false
180
- tune_projector: true
181
- tune_visual: false
182
- typical_p: 1.0
183
- use_bfloat16: false
184
- use_relative_action: true
185
- training: !!python/object:gr00t.configs.training.training_config.TrainingConfig
186
- add_rl_callback: false
187
- assert_loss_less_than: null
188
- batch_size: null
189
- bf16: true
190
- dataloader_num_workers: 2
191
- ddp_bucket_cap_mb: 100
192
- deepspeed_stage: 2
193
- enable_open_loop_eval: false
194
- enable_profiling: false
195
- eval_batch_size: 2
196
- eval_bf16: true
197
- eval_set_split_ratio: 0.1
198
- eval_steps: 500
199
- eval_strategy: 'no'
200
- experiment_name: null
201
- fp16: false
202
- global_batch_size: 32
203
- gradient_accumulation_steps: 1
204
- gradient_checkpointing: false
205
- learning_rate: 0.0001
206
- logging_steps: 10
207
- lr_scheduler_type: cosine
208
- max_concurrent_uploads: 2
209
- max_grad_norm: 1.0
210
- max_retries: 3
211
- max_steps: 20000
212
- num_gpus: 1
213
- open_loop_eval_plot_indices: null
214
- open_loop_eval_steps_per_traj: 100
215
- open_loop_eval_traj_ids:
216
- - 0
217
- optim: adamw_torch
218
- output_dir: ./outputs/fr5_cherry
219
- remove_unused_columns: false
220
- save_best_eval_metric_greater_is_better: true
221
- save_best_eval_metric_name: ''
222
- save_steps: 5000
223
- save_total_limit: 5
224
- save_vl_model: false
225
- start_from_checkpoint: ./pretrained_models/GR00T-N1.6-3B
226
- tf32: true
227
- transformers_access_token: null
228
- transformers_cache_dir: null
229
- transformers_local_files_only: false
230
- transformers_trust_remote_code: true
231
- upload_checkpoints: false
232
- upload_every: 1000
233
- upload_last_n_checkpoints: 5
234
- use_ddp: false
235
- use_wandb: false
236
- wandb_project: finetune-gr00t-n1d6
237
- warmup_ratio: 0.05
238
- warmup_steps: 0
239
- weight_decay: 1.0e-05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-15000/experiment_cfg/dataset_statistics.json DELETED
@@ -1,149 +0,0 @@
1
- {
2
- "new_embodiment": {
3
- "state": {
4
- "arm": {
5
- "min": [
6
- 0.4818978011608124,
7
- -1.687173843383789,
8
- 0.62826007604599,
9
- -2.6761701107025146,
10
- -1.8431425094604492,
11
- -0.5678880214691162
12
- ],
13
- "max": [
14
- 0.7935351133346558,
15
- -1.014952301979065,
16
- 1.8637524843215942,
17
- -1.0820374488830566,
18
- -1.4455490112304688,
19
- 0.3115537762641907
20
- ],
21
- "mean": [
22
- 0.6489784717559814,
23
- -1.3269319534301758,
24
- 1.356391429901123,
25
- -1.804563045501709,
26
- -1.619696021080017,
27
- -0.07974076271057129
28
- ],
29
- "std": [
30
- 0.053538445383310186,
31
- 0.1604488044977188,
32
- 0.2438623011112213,
33
- 0.22075510025024414,
34
- 0.07333532720804177,
35
- 0.10092151165008545
36
- ],
37
- "q01": [
38
- 0.5197953635454178,
39
- -1.6432996988296509,
40
- 0.8626433879137039,
41
- -2.5542680168151857,
42
- -1.8000394713878631,
43
- -0.37301090329885483
44
- ],
45
- "q99": [
46
- 0.7509180748462676,
47
- -1.0879072868824005,
48
- 1.7959050333499906,
49
- -1.252977850437165,
50
- -1.4705305182933812,
51
- 0.2933953133225437
52
- ]
53
- },
54
- "gripper": {
55
- "min": [
56
- 0.0
57
- ],
58
- "max": [
59
- 1.0
60
- ],
61
- "mean": [
62
- 0.7650123238563538
63
- ],
64
- "std": [
65
- 0.39907386898994446
66
- ],
67
- "q01": [
68
- 0.0
69
- ],
70
- "q99": [
71
- 1.0
72
- ]
73
- }
74
- },
75
- "action": {
76
- "arm": {
77
- "min": [
78
- 0.4818978011608124,
79
- -1.687173843383789,
80
- 0.62826007604599,
81
- -2.6573522090911865,
82
- -1.8431425094604492,
83
- -0.5678880214691162
84
- ],
85
- "max": [
86
- 0.7935351133346558,
87
- -1.014952301979065,
88
- 1.8637524843215942,
89
- -1.0820374488830566,
90
- -1.4455490112304688,
91
- 0.3115537762641907
92
- ],
93
- "mean": [
94
- 0.6489997506141663,
95
- -1.326717495918274,
96
- 1.355955958366394,
97
- -1.8026670217514038,
98
- -1.6199865341186523,
99
- -0.07982920855283737
100
- ],
101
- "std": [
102
- 0.05358240380883204,
103
- 0.16021256148815155,
104
- 0.243374302983284,
105
- 0.2178075611591339,
106
- 0.07321629673242531,
107
- 0.10097639262676239
108
- ],
109
- "q01": [
110
- 0.5197953635454178,
111
- -1.6432996988296509,
112
- 0.8625765931606293,
113
- -2.53433034658432,
114
- -1.8000823378562927,
115
- -0.37301090329885483
116
- ],
117
- "q99": [
118
- 0.7509180748462676,
119
- -1.0879072868824005,
120
- 1.7849992513656616,
121
- -1.2526323044300085,
122
- -1.4705633461475374,
123
- 0.2933953133225437
124
- ]
125
- },
126
- "gripper": {
127
- "min": [
128
- 0.0
129
- ],
130
- "max": [
131
- 1.0
132
- ],
133
- "mean": [
134
- 0.7650123238563538
135
- ],
136
- "std": [
137
- 0.39907386898994446
138
- ],
139
- "q01": [
140
- 0.0
141
- ],
142
- "q99": [
143
- 1.0
144
- ]
145
- }
146
- },
147
- "relative_action": {}
148
- }
149
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-15000/experiment_cfg/final_model_config.json DELETED
@@ -1,54 +0,0 @@
1
- {
2
- "model_type": "Gr00tN1d6",
3
- "model_dtype": "bfloat16",
4
- "model_name": "nvidia/Eagle-Block2A-2B-v2",
5
- "backbone_model_type": "eagle",
6
- "model_revision": null,
7
- "tune_top_llm_layers": 4,
8
- "backbone_embedding_dim": 2048,
9
- "tune_llm": false,
10
- "tune_visual": false,
11
- "select_layer": 16,
12
- "reproject_vision": false,
13
- "use_flash_attention": true,
14
- "load_bf16": true,
15
- "collator_overwrite_image_inputs": false,
16
- "eagle_collator": true,
17
- "backbone_trainable_params_fp32": true,
18
- "extra_augmentation_config": null,
19
- "apply_sincos_state_encoding": true,
20
- "use_relative_action": true,
21
- "max_state_dim": 128,
22
- "max_action_dim": 128,
23
- "action_horizon": 50,
24
- "hidden_size": 1024,
25
- "input_embedding_dim": 1536,
26
- "add_pos_embed": true,
27
- "attn_dropout": 0.2,
28
- "use_vlln": true,
29
- "max_seq_len": 1024,
30
- "use_alternate_vl_dit": true,
31
- "attend_text_every_n_blocks": 2,
32
- "diffusion_model_cfg": {
33
- "attention_head_dim": 48,
34
- "dropout": 0.2,
35
- "final_dropout": true,
36
- "interleave_self_attention": true,
37
- "norm_type": "ada_norm",
38
- "num_attention_heads": 32,
39
- "num_layers": 32,
40
- "output_dim": 1024,
41
- "positional_embeddings": null
42
- },
43
- "num_inference_timesteps": 4,
44
- "noise_beta_alpha": 1.5,
45
- "noise_beta_beta": 1.0,
46
- "noise_s": 0.999,
47
- "num_timestep_buckets": 1000,
48
- "tune_projector": true,
49
- "tune_diffusion_model": true,
50
- "tune_vlln": true,
51
- "state_dropout_prob": 0.0,
52
- "state_additive_noise_scale": 0.0,
53
- "max_num_embodiments": 32
54
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-15000/experiment_cfg/final_processor_config.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoint-15000/model-00001-of-00002.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b2b93b15f33bd8ea70ac7f6b11aa60c56788947162d6ae00dd1987457f54da3
3
- size 4990120184
 
 
 
 
checkpoint-15000/model-00002-of-00002.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d9be8fde5f5ace47b89e78e8cbea6a0b5494abb99ac6350affb66f53c7fc7c68
3
- size 4823190320
 
 
 
 
checkpoint-15000/model.safetensors.index.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoint-15000/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:018072403772e39d83a35fb933559eaf88901e875bf632858002d7b78a84a344
3
- size 12960193762
 
 
 
 
checkpoint-15000/processor_config.json DELETED
@@ -1,455 +0,0 @@
1
- {
2
- "processor_class": "Gr00tN1d6Processor",
3
- "processor_kwargs": {
4
- "modality_configs": {
5
- "behavior_r1_pro": {
6
- "video": {
7
- "delta_indices": [
8
- 0
9
- ],
10
- "modality_keys": [
11
- "observation.images.rgb.head_256_256",
12
- "observation.images.rgb.left_wrist_256_256",
13
- "observation.images.rgb.right_wrist_256_256"
14
- ],
15
- "sin_cos_embedding_keys": null,
16
- "mean_std_embedding_keys": null,
17
- "action_configs": null
18
- },
19
- "state": {
20
- "delta_indices": [
21
- 0
22
- ],
23
- "modality_keys": [
24
- "robot_pos",
25
- "robot_ori_cos",
26
- "robot_ori_sin",
27
- "robot_2d_ori",
28
- "robot_2d_ori_cos",
29
- "robot_2d_ori_sin",
30
- "robot_lin_vel",
31
- "robot_ang_vel",
32
- "arm_left_qpos",
33
- "arm_left_qpos_sin",
34
- "arm_left_qpos_cos",
35
- "eef_left_pos",
36
- "eef_left_quat",
37
- "gripper_left_qpos",
38
- "arm_right_qpos",
39
- "arm_right_qpos_sin",
40
- "arm_right_qpos_cos",
41
- "eef_right_pos",
42
- "eef_right_quat",
43
- "gripper_right_qpos",
44
- "trunk_qpos"
45
- ],
46
- "sin_cos_embedding_keys": null,
47
- "mean_std_embedding_keys": null,
48
- "action_configs": null
49
- },
50
- "action": {
51
- "delta_indices": [
52
- 0,
53
- 1,
54
- 2,
55
- 3,
56
- 4,
57
- 5,
58
- 6,
59
- 7,
60
- 8,
61
- 9,
62
- 10,
63
- 11,
64
- 12,
65
- 13,
66
- 14,
67
- 15,
68
- 16,
69
- 17,
70
- 18,
71
- 19,
72
- 20,
73
- 21,
74
- 22,
75
- 23,
76
- 24,
77
- 25,
78
- 26,
79
- 27,
80
- 28,
81
- 29,
82
- 30,
83
- 31
84
- ],
85
- "modality_keys": [
86
- "base",
87
- "torso",
88
- "left_arm",
89
- "left_gripper",
90
- "right_arm",
91
- "right_gripper"
92
- ],
93
- "sin_cos_embedding_keys": null,
94
- "mean_std_embedding_keys": null,
95
- "action_configs": [
96
- {
97
- "rep": "ABSOLUTE",
98
- "type": "NON_EEF",
99
- "format": "DEFAULT",
100
- "state_key": null
101
- },
102
- {
103
- "rep": "RELATIVE",
104
- "type": "NON_EEF",
105
- "format": "DEFAULT",
106
- "state_key": "trunk_qpos"
107
- },
108
- {
109
- "rep": "RELATIVE",
110
- "type": "NON_EEF",
111
- "format": "DEFAULT",
112
- "state_key": "arm_left_qpos"
113
- },
114
- {
115
- "rep": "ABSOLUTE",
116
- "type": "NON_EEF",
117
- "format": "DEFAULT",
118
- "state_key": null
119
- },
120
- {
121
- "rep": "RELATIVE",
122
- "type": "NON_EEF",
123
- "format": "DEFAULT",
124
- "state_key": "arm_right_qpos"
125
- },
126
- {
127
- "rep": "ABSOLUTE",
128
- "type": "NON_EEF",
129
- "format": "DEFAULT",
130
- "state_key": null
131
- }
132
- ]
133
- },
134
- "language": {
135
- "delta_indices": [
136
- 0
137
- ],
138
- "modality_keys": [
139
- "annotation.human.coarse_action"
140
- ],
141
- "sin_cos_embedding_keys": null,
142
- "mean_std_embedding_keys": null,
143
- "action_configs": null
144
- }
145
- },
146
- "gr1": {
147
- "video": {
148
- "delta_indices": [
149
- 0
150
- ],
151
- "modality_keys": [
152
- "ego_view_bg_crop_pad_res256_freq20"
153
- ],
154
- "sin_cos_embedding_keys": null,
155
- "mean_std_embedding_keys": null,
156
- "action_configs": null
157
- },
158
- "state": {
159
- "delta_indices": [
160
- 0
161
- ],
162
- "modality_keys": [
163
- "left_arm",
164
- "right_arm",
165
- "left_hand",
166
- "right_hand",
167
- "waist"
168
- ],
169
- "sin_cos_embedding_keys": [
170
- "left_arm",
171
- "right_arm",
172
- "left_hand",
173
- "right_hand",
174
- "waist"
175
- ],
176
- "mean_std_embedding_keys": null,
177
- "action_configs": null
178
- },
179
- "action": {
180
- "delta_indices": [
181
- 0,
182
- 1,
183
- 2,
184
- 3,
185
- 4,
186
- 5,
187
- 6,
188
- 7,
189
- 8,
190
- 9,
191
- 10,
192
- 11,
193
- 12,
194
- 13,
195
- 14,
196
- 15
197
- ],
198
- "modality_keys": [
199
- "left_arm",
200
- "right_arm",
201
- "left_hand",
202
- "right_hand",
203
- "waist"
204
- ],
205
- "sin_cos_embedding_keys": null,
206
- "mean_std_embedding_keys": null,
207
- "action_configs": [
208
- {
209
- "rep": "RELATIVE",
210
- "type": "NON_EEF",
211
- "format": "DEFAULT",
212
- "state_key": null
213
- },
214
- {
215
- "rep": "RELATIVE",
216
- "type": "NON_EEF",
217
- "format": "DEFAULT",
218
- "state_key": null
219
- },
220
- {
221
- "rep": "RELATIVE",
222
- "type": "NON_EEF",
223
- "format": "DEFAULT",
224
- "state_key": null
225
- },
226
- {
227
- "rep": "RELATIVE",
228
- "type": "NON_EEF",
229
- "format": "DEFAULT",
230
- "state_key": null
231
- },
232
- {
233
- "rep": "ABSOLUTE",
234
- "type": "NON_EEF",
235
- "format": "DEFAULT",
236
- "state_key": null
237
- }
238
- ]
239
- },
240
- "language": {
241
- "delta_indices": [
242
- 0
243
- ],
244
- "modality_keys": [
245
- "task"
246
- ],
247
- "sin_cos_embedding_keys": null,
248
- "mean_std_embedding_keys": null,
249
- "action_configs": null
250
- }
251
- },
252
- "robocasa_panda_omron": {
253
- "video": {
254
- "delta_indices": [
255
- 0
256
- ],
257
- "modality_keys": [
258
- "res256_image_side_0",
259
- "res256_image_side_1",
260
- "res256_image_wrist_0"
261
- ],
262
- "sin_cos_embedding_keys": null,
263
- "mean_std_embedding_keys": null,
264
- "action_configs": null
265
- },
266
- "state": {
267
- "delta_indices": [
268
- 0
269
- ],
270
- "modality_keys": [
271
- "end_effector_position_relative",
272
- "end_effector_rotation_relative",
273
- "gripper_qpos",
274
- "base_position",
275
- "base_rotation"
276
- ],
277
- "sin_cos_embedding_keys": null,
278
- "mean_std_embedding_keys": null,
279
- "action_configs": null
280
- },
281
- "action": {
282
- "delta_indices": [
283
- 0,
284
- 1,
285
- 2,
286
- 3,
287
- 4,
288
- 5,
289
- 6,
290
- 7,
291
- 8,
292
- 9,
293
- 10,
294
- 11,
295
- 12,
296
- 13,
297
- 14,
298
- 15
299
- ],
300
- "modality_keys": [
301
- "end_effector_position",
302
- "end_effector_rotation",
303
- "gripper_close",
304
- "base_motion",
305
- "control_mode"
306
- ],
307
- "sin_cos_embedding_keys": null,
308
- "mean_std_embedding_keys": null,
309
- "action_configs": [
310
- {
311
- "rep": "ABSOLUTE",
312
- "type": "NON_EEF",
313
- "format": "DEFAULT",
314
- "state_key": null
315
- },
316
- {
317
- "rep": "ABSOLUTE",
318
- "type": "NON_EEF",
319
- "format": "DEFAULT",
320
- "state_key": null
321
- },
322
- {
323
- "rep": "ABSOLUTE",
324
- "type": "NON_EEF",
325
- "format": "DEFAULT",
326
- "state_key": null
327
- },
328
- {
329
- "rep": "ABSOLUTE",
330
- "type": "NON_EEF",
331
- "format": "DEFAULT",
332
- "state_key": null
333
- },
334
- {
335
- "rep": "ABSOLUTE",
336
- "type": "NON_EEF",
337
- "format": "DEFAULT",
338
- "state_key": null
339
- }
340
- ]
341
- },
342
- "language": {
343
- "delta_indices": [
344
- 0
345
- ],
346
- "modality_keys": [
347
- "annotation.human.action.task_description"
348
- ],
349
- "sin_cos_embedding_keys": null,
350
- "mean_std_embedding_keys": null,
351
- "action_configs": null
352
- }
353
- },
354
- "new_embodiment": {
355
- "video": {
356
- "delta_indices": [
357
- 0
358
- ],
359
- "modality_keys": [
360
- "cam_base",
361
- "cam_wrist"
362
- ],
363
- "sin_cos_embedding_keys": null,
364
- "mean_std_embedding_keys": null,
365
- "action_configs": null
366
- },
367
- "state": {
368
- "delta_indices": [
369
- 0
370
- ],
371
- "modality_keys": [
372
- "arm",
373
- "gripper"
374
- ],
375
- "sin_cos_embedding_keys": null,
376
- "mean_std_embedding_keys": null,
377
- "action_configs": null
378
- },
379
- "action": {
380
- "delta_indices": [
381
- 0,
382
- 1,
383
- 2,
384
- 3,
385
- 4,
386
- 5,
387
- 6,
388
- 7,
389
- 8,
390
- 9,
391
- 10,
392
- 11,
393
- 12,
394
- 13,
395
- 14,
396
- 15
397
- ],
398
- "modality_keys": [
399
- "arm",
400
- "gripper"
401
- ],
402
- "sin_cos_embedding_keys": null,
403
- "mean_std_embedding_keys": null,
404
- "action_configs": [
405
- {
406
- "rep": "ABSOLUTE",
407
- "type": "NON_EEF",
408
- "format": "DEFAULT",
409
- "state_key": null
410
- },
411
- {
412
- "rep": "ABSOLUTE",
413
- "type": "NON_EEF",
414
- "format": "DEFAULT",
415
- "state_key": null
416
- }
417
- ]
418
- },
419
- "language": {
420
- "delta_indices": [
421
- 0
422
- ],
423
- "modality_keys": [
424
- "annotation.human.task_description"
425
- ],
426
- "sin_cos_embedding_keys": null,
427
- "mean_std_embedding_keys": null,
428
- "action_configs": null
429
- }
430
- }
431
- },
432
- "image_crop_size": null,
433
- "image_target_size": null,
434
- "use_albumentations": true,
435
- "random_rotation_angle": null,
436
- "color_jitter_params": {
437
- "brightness": 0.3,
438
- "contrast": 0.4,
439
- "saturation": 0.5,
440
- "hue": 0.08
441
- },
442
- "shortest_image_edge": 256,
443
- "crop_fraction": 0.95,
444
- "model_name": "nvidia/Eagle-Block2A-2B-v2",
445
- "model_type": "eagle",
446
- "formalize_language": true,
447
- "max_state_dim": 128,
448
- "max_action_dim": 128,
449
- "max_action_horizon": 50,
450
- "use_percentiles": false,
451
- "clip_outliers": true,
452
- "apply_sincos_state_encoding": true,
453
- "use_relative_action": true
454
- }
455
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-15000/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:292427e64b7e1ca74d7fc55bdd4f7612064cd77b7dcb6cf568f0be95dfd5152b
3
- size 14645
 
 
 
 
checkpoint-15000/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ee0ce25fd589eb0fbcffab63bcc2e2a86e5fb56630601b06b6bc1e425539b706
3
- size 1465
 
 
 
 
checkpoint-15000/statistics.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoint-15000/trainer_state.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoint-15000/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:49cf1bb9160fba423ccdd2dae7d9b40228772dd15a3fb17d44a4fd07f7818f45
3
- size 5713
 
 
 
 
checkpoint-15000/wandb_config.json DELETED
@@ -1 +0,0 @@
1
- {"project": "finetune-gr00t-n1d6", "run_id": "fr5_cherry"}
 
 
checkpoint-20000/config.json DELETED
@@ -1,70 +0,0 @@
1
- {
2
- "action_horizon": 50,
3
- "add_pos_embed": true,
4
- "apply_sincos_state_encoding": true,
5
- "architectures": [
6
- "Gr00tN1d6"
7
- ],
8
- "attn_dropout": 0.2,
9
- "attn_implementation": null,
10
- "backbone_embedding_dim": 2048,
11
- "backbone_model_type": "eagle",
12
- "backbone_trainable_params_fp32": true,
13
- "collator_overwrite_image_inputs": false,
14
- "color_jitter_params": {
15
- "brightness": 0.1,
16
- "contrast": 0.1,
17
- "hue": 0.1,
18
- "saturation": 0.1
19
- },
20
- "crop_fraction": 0.95,
21
- "diffusion_model_cfg": {
22
- "attention_head_dim": 48,
23
- "dropout": 0.2,
24
- "final_dropout": true,
25
- "interleave_self_attention": true,
26
- "norm_type": "ada_norm",
27
- "num_attention_heads": 32,
28
- "num_layers": 32,
29
- "output_dim": 1024,
30
- "positional_embeddings": null
31
- },
32
- "eagle_collator": true,
33
- "formalize_language": true,
34
- "gemma_collator": false,
35
- "hidden_size": 1024,
36
- "image_crop_size": null,
37
- "image_target_size": null,
38
- "input_embedding_dim": 1536,
39
- "load_bf16": true,
40
- "max_action_dim": 128,
41
- "max_num_embodiments": 32,
42
- "max_seq_len": 1024,
43
- "max_state_dim": 128,
44
- "model_dtype": "bfloat16",
45
- "model_name": "nvidia/Eagle-Block2A-2B-v2",
46
- "model_type": "Gr00tN1d6",
47
- "noise_beta_alpha": 1.5,
48
- "noise_beta_beta": 1.0,
49
- "noise_s": 0.999,
50
- "num_inference_timesteps": 4,
51
- "num_timestep_buckets": 1000,
52
- "random_rotation_angle": null,
53
- "reproject_vision": false,
54
- "select_layer": 16,
55
- "shortest_image_edge": 256,
56
- "state_dropout_prob": 0.0,
57
- "torch_dtype": "bfloat16",
58
- "transformers_version": "4.51.3",
59
- "tune_diffusion_model": true,
60
- "tune_llm": false,
61
- "tune_projector": true,
62
- "tune_top_llm_layers": 4,
63
- "tune_visual": false,
64
- "tune_vlln": true,
65
- "use_albumentations_transforms": true,
66
- "use_alternate_vl_dit": true,
67
- "use_flash_attention": true,
68
- "use_relative_action": true,
69
- "use_vlln": true
70
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-20000/embodiment_id.json DELETED
@@ -1,11 +0,0 @@
1
- {
2
- "robocasa_panda_omron": 13,
3
- "gr1": 20,
4
- "behavior_r1_pro": 24,
5
- "unitree_g1": 8,
6
- "oxe_google": 0,
7
- "oxe_widowx": 1,
8
- "libero_panda": 2,
9
- "oxe_droid": 16,
10
- "new_embodiment": 10
11
- }
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-20000/experiment_cfg/conf.yaml DELETED
@@ -1,206 +0,0 @@
1
- load_config_path: null
2
- model:
3
- model_type: Gr00tN1d6
4
- model_dtype: bfloat16
5
- model_name: nvidia/Eagle-Block2A-2B-v2
6
- backbone_model_type: eagle
7
- model_revision: null
8
- tune_top_llm_layers: 4
9
- backbone_embedding_dim: 2048
10
- tune_llm: false
11
- tune_visual: false
12
- select_layer: 16
13
- reproject_vision: false
14
- use_flash_attention: true
15
- load_bf16: false
16
- collator_overwrite_image_inputs: false
17
- eagle_collator: true
18
- backbone_trainable_params_fp32: true
19
- image_crop_size: null
20
- image_target_size: null
21
- shortest_image_edge: 256
22
- crop_fraction: 0.95
23
- random_rotation_angle: null
24
- color_jitter_params: null
25
- use_albumentations_transforms: true
26
- extra_augmentation_config: null
27
- formalize_language: true
28
- apply_sincos_state_encoding: false
29
- use_relative_action: true
30
- max_state_dim: 29
31
- max_action_dim: 29
32
- action_horizon: 16
33
- hidden_size: 1024
34
- input_embedding_dim: 1536
35
- add_pos_embed: true
36
- attn_dropout: 0.2
37
- use_vlln: true
38
- max_seq_len: 1024
39
- use_alternate_vl_dit: true
40
- attend_text_every_n_blocks: 2
41
- diffusion_model_cfg:
42
- positional_embeddings: null
43
- num_layers: 32
44
- num_attention_heads: 32
45
- attention_head_dim: 48
46
- norm_type: ada_norm
47
- dropout: 0.2
48
- final_dropout: true
49
- output_dim: 1024
50
- interleave_self_attention: true
51
- num_inference_timesteps: 4
52
- noise_beta_alpha: 1.5
53
- noise_beta_beta: 1.0
54
- noise_s: 0.999
55
- num_timestep_buckets: 1000
56
- tune_projector: true
57
- tune_diffusion_model: true
58
- tune_vlln: true
59
- state_dropout_prob: 0.0
60
- state_additive_noise_scale: 0.0
61
- max_num_embodiments: 32
62
- data:
63
- datasets:
64
- - dataset_paths:
65
- - ./cherry_data
66
- embodiment_tag: new_embodiment
67
- mix_ratio: 1.0
68
- dataset_type: physical_embodiment
69
- val_dataset_path: null
70
- modality_configs:
71
- new_embodiment:
72
- video:
73
- delta_indices:
74
- - 0
75
- modality_keys:
76
- - cam_base
77
- - cam_wrist
78
- sin_cos_embedding_keys: null
79
- mean_std_embedding_keys: null
80
- action_configs: null
81
- state:
82
- delta_indices:
83
- - 0
84
- modality_keys:
85
- - arm
86
- - gripper
87
- sin_cos_embedding_keys: null
88
- mean_std_embedding_keys: null
89
- action_configs: null
90
- action:
91
- delta_indices:
92
- - 0
93
- - 1
94
- - 2
95
- - 3
96
- - 4
97
- - 5
98
- - 6
99
- - 7
100
- - 8
101
- - 9
102
- - 10
103
- - 11
104
- - 12
105
- - 13
106
- - 14
107
- - 15
108
- modality_keys:
109
- - arm
110
- - gripper
111
- sin_cos_embedding_keys: null
112
- mean_std_embedding_keys: null
113
- action_configs:
114
- - rep: ABSOLUTE
115
- type: NON_EEF
116
- format: DEFAULT
117
- state_key: null
118
- - rep: ABSOLUTE
119
- type: NON_EEF
120
- format: DEFAULT
121
- state_key: null
122
- language:
123
- delta_indices:
124
- - 0
125
- modality_keys:
126
- - annotation.human.task_description
127
- sin_cos_embedding_keys: null
128
- mean_std_embedding_keys: null
129
- action_configs: null
130
- download_cache: false
131
- shard_size: 1024
132
- episode_sampling_rate: 0.1
133
- num_shards_per_epoch: 100000
134
- override_pretraining_statistics: false
135
- mode: single_turn
136
- random_chop: 0.0
137
- mock_dataset_mode: false
138
- shuffle: true
139
- seed: 42
140
- multiprocessing_context: fork
141
- allow_padding: false
142
- subsample_ratio: 1.0
143
- image_crop_size:
144
- - 244
145
- - 244
146
- image_target_size:
147
- - 224
148
- - 224
149
- video_backend: decord
150
- training:
151
- output_dir: ./outputs/fr5_cherry
152
- experiment_name: null
153
- max_steps: 20000
154
- global_batch_size: 32
155
- batch_size: null
156
- gradient_accumulation_steps: 1
157
- learning_rate: 0.0001
158
- lr_scheduler_type: cosine
159
- weight_decay: 1.0e-05
160
- warmup_ratio: 0.05
161
- warmup_steps: 0
162
- max_grad_norm: 1.0
163
- optim: adamw_torch
164
- start_from_checkpoint: ./pretrained_models/GR00T-N1.6-3B
165
- tf32: true
166
- fp16: false
167
- bf16: true
168
- eval_bf16: true
169
- logging_steps: 10
170
- save_steps: 5000
171
- save_total_limit: 5
172
- save_vl_model: false
173
- upload_checkpoints: false
174
- upload_every: 1000
175
- upload_last_n_checkpoints: 5
176
- max_concurrent_uploads: 2
177
- eval_strategy: 'no'
178
- eval_steps: 500
179
- eval_set_split_ratio: 0.1
180
- eval_batch_size: 2
181
- save_best_eval_metric_name: ''
182
- save_best_eval_metric_greater_is_better: true
183
- deepspeed_stage: 2
184
- gradient_checkpointing: false
185
- transformers_trust_remote_code: true
186
- transformers_local_files_only: false
187
- transformers_cache_dir: null
188
- transformers_access_token: null
189
- use_ddp: false
190
- ddp_bucket_cap_mb: 100
191
- num_gpus: 1
192
- dataloader_num_workers: 2
193
- remove_unused_columns: false
194
- use_wandb: false
195
- wandb_project: finetune-gr00t-n1d6
196
- enable_profiling: false
197
- max_retries: 3
198
- assert_loss_less_than: null
199
- add_rl_callback: false
200
- enable_open_loop_eval: false
201
- open_loop_eval_traj_ids:
202
- - 0
203
- open_loop_eval_steps_per_traj: 100
204
- open_loop_eval_plot_indices: null
205
- max_steps: 20000
206
- save_steps: 5000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-20000/experiment_cfg/config.yaml DELETED
@@ -1,239 +0,0 @@
1
- !!python/object:gr00t.configs.base_config.Config
2
- data: !!python/object:gr00t.configs.data.data_config.DataConfig
3
- allow_padding: false
4
- datasets:
5
- - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
6
- dataset_paths:
7
- - ./cherry_data
8
- dataset_type: physical_embodiment
9
- embodiment_tag: new_embodiment
10
- mix_ratio: 1.0
11
- val_dataset_path: null
12
- download_cache: false
13
- episode_sampling_rate: 0.1
14
- image_crop_size:
15
- - 244
16
- - 244
17
- image_target_size:
18
- - 224
19
- - 224
20
- mock_dataset_mode: false
21
- modality_configs:
22
- new_embodiment:
23
- action: !!python/object:gr00t.data.types.ModalityConfig
24
- action_configs:
25
- - !!python/object:gr00t.data.types.ActionConfig
26
- format: &id001 !!python/object/apply:gr00t.data.types.ActionFormat
27
- - default
28
- rep: &id002 !!python/object/apply:gr00t.data.types.ActionRepresentation
29
- - absolute
30
- state_key: null
31
- type: &id003 !!python/object/apply:gr00t.data.types.ActionType
32
- - non_eef
33
- - !!python/object:gr00t.data.types.ActionConfig
34
- format: *id001
35
- rep: *id002
36
- state_key: null
37
- type: *id003
38
- delta_indices:
39
- - 0
40
- - 1
41
- - 2
42
- - 3
43
- - 4
44
- - 5
45
- - 6
46
- - 7
47
- - 8
48
- - 9
49
- - 10
50
- - 11
51
- - 12
52
- - 13
53
- - 14
54
- - 15
55
- mean_std_embedding_keys: null
56
- modality_keys:
57
- - arm
58
- - gripper
59
- sin_cos_embedding_keys: null
60
- language: !!python/object:gr00t.data.types.ModalityConfig
61
- action_configs: null
62
- delta_indices:
63
- - 0
64
- mean_std_embedding_keys: null
65
- modality_keys:
66
- - annotation.human.task_description
67
- sin_cos_embedding_keys: null
68
- state: !!python/object:gr00t.data.types.ModalityConfig
69
- action_configs: null
70
- delta_indices:
71
- - 0
72
- mean_std_embedding_keys: null
73
- modality_keys:
74
- - arm
75
- - gripper
76
- sin_cos_embedding_keys: null
77
- video: !!python/object:gr00t.data.types.ModalityConfig
78
- action_configs: null
79
- delta_indices:
80
- - 0
81
- mean_std_embedding_keys: null
82
- modality_keys:
83
- - cam_base
84
- - cam_wrist
85
- sin_cos_embedding_keys: null
86
- mode: single_turn
87
- multiprocessing_context: fork
88
- num_shards_per_epoch: 100000
89
- override_pretraining_statistics: false
90
- random_chop: 0.0
91
- seed: 42
92
- shard_size: 1024
93
- shuffle: true
94
- subsample_ratio: 1.0
95
- video_backend: decord
96
- load_config_path: null
97
- model: !!python/object:gr00t.configs.model.gr00t_n1d6.Gr00tN1d6Config
98
- _attn_implementation_autoset: false
99
- _attn_implementation_internal: null
100
- _commit_hash: null
101
- _name_or_path: ''
102
- add_cross_attention: false
103
- architectures: null
104
- backbone_model_type: eagle
105
- backbone_trainable_params_fp32: true
106
- bad_words_ids: null
107
- begin_suppress_tokens: null
108
- bos_token_id: null
109
- chunk_size_feed_forward: 0
110
- color_jitter_params: null
111
- cross_attention_hidden_size: null
112
- decoder_start_token_id: null
113
- diffusion_model_cfg:
114
- attention_head_dim: 48
115
- dropout: 0.2
116
- final_dropout: true
117
- interleave_self_attention: true
118
- norm_type: ada_norm
119
- num_attention_heads: 32
120
- num_layers: 32
121
- output_dim: 1024
122
- positional_embeddings: null
123
- diversity_penalty: 0.0
124
- do_sample: false
125
- eagle_collator: true
126
- early_stopping: false
127
- encoder_no_repeat_ngram_size: 0
128
- eos_token_id: null
129
- exponential_decay_length_penalty: null
130
- extra_augmentation_config: null
131
- finetuning_task: null
132
- forced_bos_token_id: null
133
- forced_eos_token_id: null
134
- id2label:
135
- 0: LABEL_0
136
- 1: LABEL_1
137
- is_decoder: false
138
- is_encoder_decoder: false
139
- label2id:
140
- LABEL_0: 0
141
- LABEL_1: 1
142
- length_penalty: 1.0
143
- load_bf16: false
144
- max_length: 20
145
- min_length: 0
146
- model_name: nvidia/Eagle-Block2A-2B-v2
147
- no_repeat_ngram_size: 0
148
- num_beam_groups: 1
149
- num_beams: 1
150
- num_return_sequences: 1
151
- output_attentions: false
152
- output_hidden_states: false
153
- output_scores: false
154
- pad_token_id: null
155
- prefix: null
156
- problem_type: null
157
- pruned_heads: {}
158
- random_rotation_angle: null
159
- remove_invalid_values: false
160
- repetition_penalty: 1.0
161
- reproject_vision: false
162
- return_dict: true
163
- return_dict_in_generate: false
164
- sep_token_id: null
165
- state_dropout_prob: 0.0
166
- suppress_tokens: null
167
- task_specific_params: null
168
- temperature: 1.0
169
- tf_legacy_loss: false
170
- tie_encoder_decoder: false
171
- tie_word_embeddings: true
172
- tokenizer_class: null
173
- top_k: 50
174
- top_p: 1.0
175
- torch_dtype: null
176
- torchscript: false
177
- transformers_version: null
178
- tune_diffusion_model: true
179
- tune_llm: false
180
- tune_projector: true
181
- tune_visual: false
182
- typical_p: 1.0
183
- use_bfloat16: false
184
- use_relative_action: true
185
- training: !!python/object:gr00t.configs.training.training_config.TrainingConfig
186
- add_rl_callback: false
187
- assert_loss_less_than: null
188
- batch_size: null
189
- bf16: true
190
- dataloader_num_workers: 2
191
- ddp_bucket_cap_mb: 100
192
- deepspeed_stage: 2
193
- enable_open_loop_eval: false
194
- enable_profiling: false
195
- eval_batch_size: 2
196
- eval_bf16: true
197
- eval_set_split_ratio: 0.1
198
- eval_steps: 500
199
- eval_strategy: 'no'
200
- experiment_name: null
201
- fp16: false
202
- global_batch_size: 32
203
- gradient_accumulation_steps: 1
204
- gradient_checkpointing: false
205
- learning_rate: 0.0001
206
- logging_steps: 10
207
- lr_scheduler_type: cosine
208
- max_concurrent_uploads: 2
209
- max_grad_norm: 1.0
210
- max_retries: 3
211
- max_steps: 20000
212
- num_gpus: 1
213
- open_loop_eval_plot_indices: null
214
- open_loop_eval_steps_per_traj: 100
215
- open_loop_eval_traj_ids:
216
- - 0
217
- optim: adamw_torch
218
- output_dir: ./outputs/fr5_cherry
219
- remove_unused_columns: false
220
- save_best_eval_metric_greater_is_better: true
221
- save_best_eval_metric_name: ''
222
- save_steps: 5000
223
- save_total_limit: 5
224
- save_vl_model: false
225
- start_from_checkpoint: ./pretrained_models/GR00T-N1.6-3B
226
- tf32: true
227
- transformers_access_token: null
228
- transformers_cache_dir: null
229
- transformers_local_files_only: false
230
- transformers_trust_remote_code: true
231
- upload_checkpoints: false
232
- upload_every: 1000
233
- upload_last_n_checkpoints: 5
234
- use_ddp: false
235
- use_wandb: false
236
- wandb_project: finetune-gr00t-n1d6
237
- warmup_ratio: 0.05
238
- warmup_steps: 0
239
- weight_decay: 1.0e-05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-20000/experiment_cfg/dataset_statistics.json DELETED
@@ -1,149 +0,0 @@
1
- {
2
- "new_embodiment": {
3
- "state": {
4
- "arm": {
5
- "min": [
6
- 0.4818978011608124,
7
- -1.687173843383789,
8
- 0.62826007604599,
9
- -2.6761701107025146,
10
- -1.8431425094604492,
11
- -0.5678880214691162
12
- ],
13
- "max": [
14
- 0.7935351133346558,
15
- -1.014952301979065,
16
- 1.8637524843215942,
17
- -1.0820374488830566,
18
- -1.4455490112304688,
19
- 0.3115537762641907
20
- ],
21
- "mean": [
22
- 0.6489784717559814,
23
- -1.3269319534301758,
24
- 1.356391429901123,
25
- -1.804563045501709,
26
- -1.619696021080017,
27
- -0.07974076271057129
28
- ],
29
- "std": [
30
- 0.053538445383310186,
31
- 0.1604488044977188,
32
- 0.2438623011112213,
33
- 0.22075510025024414,
34
- 0.07333532720804177,
35
- 0.10092151165008545
36
- ],
37
- "q01": [
38
- 0.5197953635454178,
39
- -1.6432996988296509,
40
- 0.8626433879137039,
41
- -2.5542680168151857,
42
- -1.8000394713878631,
43
- -0.37301090329885483
44
- ],
45
- "q99": [
46
- 0.7509180748462676,
47
- -1.0879072868824005,
48
- 1.7959050333499906,
49
- -1.252977850437165,
50
- -1.4705305182933812,
51
- 0.2933953133225437
52
- ]
53
- },
54
- "gripper": {
55
- "min": [
56
- 0.0
57
- ],
58
- "max": [
59
- 1.0
60
- ],
61
- "mean": [
62
- 0.7650123238563538
63
- ],
64
- "std": [
65
- 0.39907386898994446
66
- ],
67
- "q01": [
68
- 0.0
69
- ],
70
- "q99": [
71
- 1.0
72
- ]
73
- }
74
- },
75
- "action": {
76
- "arm": {
77
- "min": [
78
- 0.4818978011608124,
79
- -1.687173843383789,
80
- 0.62826007604599,
81
- -2.6573522090911865,
82
- -1.8431425094604492,
83
- -0.5678880214691162
84
- ],
85
- "max": [
86
- 0.7935351133346558,
87
- -1.014952301979065,
88
- 1.8637524843215942,
89
- -1.0820374488830566,
90
- -1.4455490112304688,
91
- 0.3115537762641907
92
- ],
93
- "mean": [
94
- 0.6489997506141663,
95
- -1.326717495918274,
96
- 1.355955958366394,
97
- -1.8026670217514038,
98
- -1.6199865341186523,
99
- -0.07982920855283737
100
- ],
101
- "std": [
102
- 0.05358240380883204,
103
- 0.16021256148815155,
104
- 0.243374302983284,
105
- 0.2178075611591339,
106
- 0.07321629673242531,
107
- 0.10097639262676239
108
- ],
109
- "q01": [
110
- 0.5197953635454178,
111
- -1.6432996988296509,
112
- 0.8625765931606293,
113
- -2.53433034658432,
114
- -1.8000823378562927,
115
- -0.37301090329885483
116
- ],
117
- "q99": [
118
- 0.7509180748462676,
119
- -1.0879072868824005,
120
- 1.7849992513656616,
121
- -1.2526323044300085,
122
- -1.4705633461475374,
123
- 0.2933953133225437
124
- ]
125
- },
126
- "gripper": {
127
- "min": [
128
- 0.0
129
- ],
130
- "max": [
131
- 1.0
132
- ],
133
- "mean": [
134
- 0.7650123238563538
135
- ],
136
- "std": [
137
- 0.39907386898994446
138
- ],
139
- "q01": [
140
- 0.0
141
- ],
142
- "q99": [
143
- 1.0
144
- ]
145
- }
146
- },
147
- "relative_action": {}
148
- }
149
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-20000/experiment_cfg/final_model_config.json DELETED
@@ -1,54 +0,0 @@
1
- {
2
- "model_type": "Gr00tN1d6",
3
- "model_dtype": "bfloat16",
4
- "model_name": "nvidia/Eagle-Block2A-2B-v2",
5
- "backbone_model_type": "eagle",
6
- "model_revision": null,
7
- "tune_top_llm_layers": 4,
8
- "backbone_embedding_dim": 2048,
9
- "tune_llm": false,
10
- "tune_visual": false,
11
- "select_layer": 16,
12
- "reproject_vision": false,
13
- "use_flash_attention": true,
14
- "load_bf16": true,
15
- "collator_overwrite_image_inputs": false,
16
- "eagle_collator": true,
17
- "backbone_trainable_params_fp32": true,
18
- "extra_augmentation_config": null,
19
- "apply_sincos_state_encoding": true,
20
- "use_relative_action": true,
21
- "max_state_dim": 128,
22
- "max_action_dim": 128,
23
- "action_horizon": 50,
24
- "hidden_size": 1024,
25
- "input_embedding_dim": 1536,
26
- "add_pos_embed": true,
27
- "attn_dropout": 0.2,
28
- "use_vlln": true,
29
- "max_seq_len": 1024,
30
- "use_alternate_vl_dit": true,
31
- "attend_text_every_n_blocks": 2,
32
- "diffusion_model_cfg": {
33
- "attention_head_dim": 48,
34
- "dropout": 0.2,
35
- "final_dropout": true,
36
- "interleave_self_attention": true,
37
- "norm_type": "ada_norm",
38
- "num_attention_heads": 32,
39
- "num_layers": 32,
40
- "output_dim": 1024,
41
- "positional_embeddings": null
42
- },
43
- "num_inference_timesteps": 4,
44
- "noise_beta_alpha": 1.5,
45
- "noise_beta_beta": 1.0,
46
- "noise_s": 0.999,
47
- "num_timestep_buckets": 1000,
48
- "tune_projector": true,
49
- "tune_diffusion_model": true,
50
- "tune_vlln": true,
51
- "state_dropout_prob": 0.0,
52
- "state_additive_noise_scale": 0.0,
53
- "max_num_embodiments": 32
54
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-20000/experiment_cfg/final_processor_config.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoint-20000/model-00001-of-00002.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e567773ecb7739580072e23d2cf32c20483faa087d934138f85f3e9f717ef54c
3
- size 4990120184
 
 
 
 
checkpoint-20000/model-00002-of-00002.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a82b8434c44a16ff46ed3ac65a9082e15ef327779100bf4e7ac0633f694dc1dc
3
- size 4823190320
 
 
 
 
checkpoint-20000/model.safetensors.index.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoint-20000/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a4808f6a3470dddc484424c6c3648b3b4b808e96ea886a0505ea84dcea913924
3
- size 12960193762
 
 
 
 
checkpoint-20000/processor_config.json DELETED
@@ -1,455 +0,0 @@
1
- {
2
- "processor_class": "Gr00tN1d6Processor",
3
- "processor_kwargs": {
4
- "modality_configs": {
5
- "behavior_r1_pro": {
6
- "video": {
7
- "delta_indices": [
8
- 0
9
- ],
10
- "modality_keys": [
11
- "observation.images.rgb.head_256_256",
12
- "observation.images.rgb.left_wrist_256_256",
13
- "observation.images.rgb.right_wrist_256_256"
14
- ],
15
- "sin_cos_embedding_keys": null,
16
- "mean_std_embedding_keys": null,
17
- "action_configs": null
18
- },
19
- "state": {
20
- "delta_indices": [
21
- 0
22
- ],
23
- "modality_keys": [
24
- "robot_pos",
25
- "robot_ori_cos",
26
- "robot_ori_sin",
27
- "robot_2d_ori",
28
- "robot_2d_ori_cos",
29
- "robot_2d_ori_sin",
30
- "robot_lin_vel",
31
- "robot_ang_vel",
32
- "arm_left_qpos",
33
- "arm_left_qpos_sin",
34
- "arm_left_qpos_cos",
35
- "eef_left_pos",
36
- "eef_left_quat",
37
- "gripper_left_qpos",
38
- "arm_right_qpos",
39
- "arm_right_qpos_sin",
40
- "arm_right_qpos_cos",
41
- "eef_right_pos",
42
- "eef_right_quat",
43
- "gripper_right_qpos",
44
- "trunk_qpos"
45
- ],
46
- "sin_cos_embedding_keys": null,
47
- "mean_std_embedding_keys": null,
48
- "action_configs": null
49
- },
50
- "action": {
51
- "delta_indices": [
52
- 0,
53
- 1,
54
- 2,
55
- 3,
56
- 4,
57
- 5,
58
- 6,
59
- 7,
60
- 8,
61
- 9,
62
- 10,
63
- 11,
64
- 12,
65
- 13,
66
- 14,
67
- 15,
68
- 16,
69
- 17,
70
- 18,
71
- 19,
72
- 20,
73
- 21,
74
- 22,
75
- 23,
76
- 24,
77
- 25,
78
- 26,
79
- 27,
80
- 28,
81
- 29,
82
- 30,
83
- 31
84
- ],
85
- "modality_keys": [
86
- "base",
87
- "torso",
88
- "left_arm",
89
- "left_gripper",
90
- "right_arm",
91
- "right_gripper"
92
- ],
93
- "sin_cos_embedding_keys": null,
94
- "mean_std_embedding_keys": null,
95
- "action_configs": [
96
- {
97
- "rep": "ABSOLUTE",
98
- "type": "NON_EEF",
99
- "format": "DEFAULT",
100
- "state_key": null
101
- },
102
- {
103
- "rep": "RELATIVE",
104
- "type": "NON_EEF",
105
- "format": "DEFAULT",
106
- "state_key": "trunk_qpos"
107
- },
108
- {
109
- "rep": "RELATIVE",
110
- "type": "NON_EEF",
111
- "format": "DEFAULT",
112
- "state_key": "arm_left_qpos"
113
- },
114
- {
115
- "rep": "ABSOLUTE",
116
- "type": "NON_EEF",
117
- "format": "DEFAULT",
118
- "state_key": null
119
- },
120
- {
121
- "rep": "RELATIVE",
122
- "type": "NON_EEF",
123
- "format": "DEFAULT",
124
- "state_key": "arm_right_qpos"
125
- },
126
- {
127
- "rep": "ABSOLUTE",
128
- "type": "NON_EEF",
129
- "format": "DEFAULT",
130
- "state_key": null
131
- }
132
- ]
133
- },
134
- "language": {
135
- "delta_indices": [
136
- 0
137
- ],
138
- "modality_keys": [
139
- "annotation.human.coarse_action"
140
- ],
141
- "sin_cos_embedding_keys": null,
142
- "mean_std_embedding_keys": null,
143
- "action_configs": null
144
- }
145
- },
146
- "gr1": {
147
- "video": {
148
- "delta_indices": [
149
- 0
150
- ],
151
- "modality_keys": [
152
- "ego_view_bg_crop_pad_res256_freq20"
153
- ],
154
- "sin_cos_embedding_keys": null,
155
- "mean_std_embedding_keys": null,
156
- "action_configs": null
157
- },
158
- "state": {
159
- "delta_indices": [
160
- 0
161
- ],
162
- "modality_keys": [
163
- "left_arm",
164
- "right_arm",
165
- "left_hand",
166
- "right_hand",
167
- "waist"
168
- ],
169
- "sin_cos_embedding_keys": [
170
- "left_arm",
171
- "right_arm",
172
- "left_hand",
173
- "right_hand",
174
- "waist"
175
- ],
176
- "mean_std_embedding_keys": null,
177
- "action_configs": null
178
- },
179
- "action": {
180
- "delta_indices": [
181
- 0,
182
- 1,
183
- 2,
184
- 3,
185
- 4,
186
- 5,
187
- 6,
188
- 7,
189
- 8,
190
- 9,
191
- 10,
192
- 11,
193
- 12,
194
- 13,
195
- 14,
196
- 15
197
- ],
198
- "modality_keys": [
199
- "left_arm",
200
- "right_arm",
201
- "left_hand",
202
- "right_hand",
203
- "waist"
204
- ],
205
- "sin_cos_embedding_keys": null,
206
- "mean_std_embedding_keys": null,
207
- "action_configs": [
208
- {
209
- "rep": "RELATIVE",
210
- "type": "NON_EEF",
211
- "format": "DEFAULT",
212
- "state_key": null
213
- },
214
- {
215
- "rep": "RELATIVE",
216
- "type": "NON_EEF",
217
- "format": "DEFAULT",
218
- "state_key": null
219
- },
220
- {
221
- "rep": "RELATIVE",
222
- "type": "NON_EEF",
223
- "format": "DEFAULT",
224
- "state_key": null
225
- },
226
- {
227
- "rep": "RELATIVE",
228
- "type": "NON_EEF",
229
- "format": "DEFAULT",
230
- "state_key": null
231
- },
232
- {
233
- "rep": "ABSOLUTE",
234
- "type": "NON_EEF",
235
- "format": "DEFAULT",
236
- "state_key": null
237
- }
238
- ]
239
- },
240
- "language": {
241
- "delta_indices": [
242
- 0
243
- ],
244
- "modality_keys": [
245
- "task"
246
- ],
247
- "sin_cos_embedding_keys": null,
248
- "mean_std_embedding_keys": null,
249
- "action_configs": null
250
- }
251
- },
252
- "robocasa_panda_omron": {
253
- "video": {
254
- "delta_indices": [
255
- 0
256
- ],
257
- "modality_keys": [
258
- "res256_image_side_0",
259
- "res256_image_side_1",
260
- "res256_image_wrist_0"
261
- ],
262
- "sin_cos_embedding_keys": null,
263
- "mean_std_embedding_keys": null,
264
- "action_configs": null
265
- },
266
- "state": {
267
- "delta_indices": [
268
- 0
269
- ],
270
- "modality_keys": [
271
- "end_effector_position_relative",
272
- "end_effector_rotation_relative",
273
- "gripper_qpos",
274
- "base_position",
275
- "base_rotation"
276
- ],
277
- "sin_cos_embedding_keys": null,
278
- "mean_std_embedding_keys": null,
279
- "action_configs": null
280
- },
281
- "action": {
282
- "delta_indices": [
283
- 0,
284
- 1,
285
- 2,
286
- 3,
287
- 4,
288
- 5,
289
- 6,
290
- 7,
291
- 8,
292
- 9,
293
- 10,
294
- 11,
295
- 12,
296
- 13,
297
- 14,
298
- 15
299
- ],
300
- "modality_keys": [
301
- "end_effector_position",
302
- "end_effector_rotation",
303
- "gripper_close",
304
- "base_motion",
305
- "control_mode"
306
- ],
307
- "sin_cos_embedding_keys": null,
308
- "mean_std_embedding_keys": null,
309
- "action_configs": [
310
- {
311
- "rep": "ABSOLUTE",
312
- "type": "NON_EEF",
313
- "format": "DEFAULT",
314
- "state_key": null
315
- },
316
- {
317
- "rep": "ABSOLUTE",
318
- "type": "NON_EEF",
319
- "format": "DEFAULT",
320
- "state_key": null
321
- },
322
- {
323
- "rep": "ABSOLUTE",
324
- "type": "NON_EEF",
325
- "format": "DEFAULT",
326
- "state_key": null
327
- },
328
- {
329
- "rep": "ABSOLUTE",
330
- "type": "NON_EEF",
331
- "format": "DEFAULT",
332
- "state_key": null
333
- },
334
- {
335
- "rep": "ABSOLUTE",
336
- "type": "NON_EEF",
337
- "format": "DEFAULT",
338
- "state_key": null
339
- }
340
- ]
341
- },
342
- "language": {
343
- "delta_indices": [
344
- 0
345
- ],
346
- "modality_keys": [
347
- "annotation.human.action.task_description"
348
- ],
349
- "sin_cos_embedding_keys": null,
350
- "mean_std_embedding_keys": null,
351
- "action_configs": null
352
- }
353
- },
354
- "new_embodiment": {
355
- "video": {
356
- "delta_indices": [
357
- 0
358
- ],
359
- "modality_keys": [
360
- "cam_base",
361
- "cam_wrist"
362
- ],
363
- "sin_cos_embedding_keys": null,
364
- "mean_std_embedding_keys": null,
365
- "action_configs": null
366
- },
367
- "state": {
368
- "delta_indices": [
369
- 0
370
- ],
371
- "modality_keys": [
372
- "arm",
373
- "gripper"
374
- ],
375
- "sin_cos_embedding_keys": null,
376
- "mean_std_embedding_keys": null,
377
- "action_configs": null
378
- },
379
- "action": {
380
- "delta_indices": [
381
- 0,
382
- 1,
383
- 2,
384
- 3,
385
- 4,
386
- 5,
387
- 6,
388
- 7,
389
- 8,
390
- 9,
391
- 10,
392
- 11,
393
- 12,
394
- 13,
395
- 14,
396
- 15
397
- ],
398
- "modality_keys": [
399
- "arm",
400
- "gripper"
401
- ],
402
- "sin_cos_embedding_keys": null,
403
- "mean_std_embedding_keys": null,
404
- "action_configs": [
405
- {
406
- "rep": "ABSOLUTE",
407
- "type": "NON_EEF",
408
- "format": "DEFAULT",
409
- "state_key": null
410
- },
411
- {
412
- "rep": "ABSOLUTE",
413
- "type": "NON_EEF",
414
- "format": "DEFAULT",
415
- "state_key": null
416
- }
417
- ]
418
- },
419
- "language": {
420
- "delta_indices": [
421
- 0
422
- ],
423
- "modality_keys": [
424
- "annotation.human.task_description"
425
- ],
426
- "sin_cos_embedding_keys": null,
427
- "mean_std_embedding_keys": null,
428
- "action_configs": null
429
- }
430
- }
431
- },
432
- "image_crop_size": null,
433
- "image_target_size": null,
434
- "use_albumentations": true,
435
- "random_rotation_angle": null,
436
- "color_jitter_params": {
437
- "brightness": 0.3,
438
- "contrast": 0.4,
439
- "saturation": 0.5,
440
- "hue": 0.08
441
- },
442
- "shortest_image_edge": 256,
443
- "crop_fraction": 0.95,
444
- "model_name": "nvidia/Eagle-Block2A-2B-v2",
445
- "model_type": "eagle",
446
- "formalize_language": true,
447
- "max_state_dim": 128,
448
- "max_action_dim": 128,
449
- "max_action_horizon": 50,
450
- "use_percentiles": false,
451
- "clip_outliers": true,
452
- "apply_sincos_state_encoding": true,
453
- "use_relative_action": true
454
- }
455
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-20000/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:675751298697509e109188026494482b1da89c72d7a1ba3abec2e498516d2755
3
- size 14645
 
 
 
 
checkpoint-20000/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7fa802a80def971b73ec74284a6aa44d0b2ea101bd38ed41a3b1c1a0b4001f00
3
- size 1465