owen-burns commited on
Commit
981ceec
·
verified ·
1 Parent(s): 3befd08

Add files using upload-large-folder tool

Browse files
checkpoint-20000/config.json ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_horizon": 50,
3
+ "add_bos_embedding": true,
4
+ "add_pos_embed": true,
5
+ "apply_sincos_state_encoding": true,
6
+ "architectures": [
7
+ "Gr00tN1d6"
8
+ ],
9
+ "attn_dropout": 0.2,
10
+ "attn_implementation": null,
11
+ "backbone_embedding_dim": 2048,
12
+ "backbone_model_type": "eagle",
13
+ "backbone_trainable_params_fp32": true,
14
+ "block_size": 256,
15
+ "collator_overwrite_image_inputs": false,
16
+ "color_jitter_params": {
17
+ "brightness": 0.1,
18
+ "contrast": 0.1,
19
+ "hue": 0.1,
20
+ "saturation": 0.1
21
+ },
22
+ "crop_fraction": 0.95,
23
+ "decay_frequency": 1,
24
+ "diffusion_model_cfg": {
25
+ "attention_head_dim": 48,
26
+ "dropout": 0.2,
27
+ "final_dropout": true,
28
+ "interleave_self_attention": true,
29
+ "norm_type": "ada_norm",
30
+ "num_attention_heads": 32,
31
+ "num_layers": 32,
32
+ "output_dim": 1024,
33
+ "positional_embeddings": null
34
+ },
35
+ "drop_ltm_memory": true,
36
+ "eagle_collator": true,
37
+ "formalize_language": true,
38
+ "gemma_collator": false,
39
+ "hidden_size": 1024,
40
+ "image_crop_size": null,
41
+ "image_target_size": null,
42
+ "initial_recall_frequency": 10,
43
+ "input_embedding_dim": 1536,
44
+ "load_bf16": true,
45
+ "ltm_retrieve_blocks": 10,
46
+ "max_action_dim": 128,
47
+ "max_num_embodiments": 32,
48
+ "max_seq_len": 1024,
49
+ "max_state_dim": 128,
50
+ "model_dtype": "bfloat16",
51
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
52
+ "model_type": "Gr00tN1d6",
53
+ "noise_beta_alpha": 1.5,
54
+ "noise_beta_beta": 1.0,
55
+ "noise_s": 0.999,
56
+ "num_inference_timesteps": 4,
57
+ "num_ltm_blocks": 10,
58
+ "num_selector_layers": 2,
59
+ "num_timestep_buckets": 1000,
60
+ "num_total_blocks": 50,
61
+ "random_retriever_length": false,
62
+ "random_rotation_angle": null,
63
+ "reproject_vision": false,
64
+ "select_layer": 16,
65
+ "selector_hidden_dim": 256,
66
+ "shortest_image_edge": 256,
67
+ "state_dropout_prob": 0.0,
68
+ "torch_dtype": "bfloat16",
69
+ "transformers_version": "4.53.2",
70
+ "tune_diffusion_model": false,
71
+ "tune_enc_query_proj": false,
72
+ "tune_key_proj": false,
73
+ "tune_llm": false,
74
+ "tune_memory_bos_embedding": false,
75
+ "tune_memory_loras": false,
76
+ "tune_memory_pos_embeddings": false,
77
+ "tune_projector": true,
78
+ "tune_query_proj": false,
79
+ "tune_top_llm_layers": 4,
80
+ "tune_visual": false,
81
+ "tune_vlln": false,
82
+ "use_albumentations_transforms": true,
83
+ "use_alternate_vl_dit": true,
84
+ "use_flash_attention": true,
85
+ "use_relative_action": true,
86
+ "use_vlln": true
87
+ }
checkpoint-20000/embodiment_id.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "robocasa_panda_omron": 13,
3
+ "gr1": 20,
4
+ "behavior_r1_pro": 24,
5
+ "unitree_g1": 8,
6
+ "oxe_google": 0,
7
+ "oxe_widowx": 1,
8
+ "libero_panda": 2,
9
+ "new_embodiment": 10
10
+ }
checkpoint-20000/experiment_cfg/conf.yaml ADDED
@@ -0,0 +1,556 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ load_config_path: null
2
+ model:
3
+ model_type: Gr00tN1d6
4
+ model_dtype: bfloat16
5
+ model_name: nvidia/Eagle-Block2A-2B-v2
6
+ backbone_model_type: eagle
7
+ model_revision: null
8
+ tune_top_llm_layers: 0
9
+ backbone_embedding_dim: 2048
10
+ tune_llm: false
11
+ tune_visual: false
12
+ select_layer: 16
13
+ reproject_vision: false
14
+ use_flash_attention: true
15
+ load_bf16: false
16
+ collator_overwrite_image_inputs: false
17
+ eagle_collator: true
18
+ backbone_trainable_params_fp32: true
19
+ image_crop_size: null
20
+ image_target_size: null
21
+ shortest_image_edge: 256
22
+ crop_fraction: 0.95
23
+ random_rotation_angle: null
24
+ color_jitter_params: null
25
+ use_albumentations_transforms: true
26
+ formalize_language: true
27
+ apply_sincos_state_encoding: false
28
+ use_relative_action: true
29
+ max_state_dim: 29
30
+ max_action_dim: 29
31
+ action_horizon: 16
32
+ hidden_size: 1024
33
+ input_embedding_dim: 1536
34
+ add_pos_embed: true
35
+ attn_dropout: 0.2
36
+ use_vlln: true
37
+ max_seq_len: 1024
38
+ use_alternate_vl_dit: true
39
+ attend_text_every_n_blocks: 2
40
+ diffusion_model_cfg:
41
+ positional_embeddings: null
42
+ num_layers: 32
43
+ num_attention_heads: 32
44
+ attention_head_dim: 48
45
+ norm_type: ada_norm
46
+ dropout: 0.2
47
+ final_dropout: true
48
+ output_dim: 1024
49
+ interleave_self_attention: true
50
+ num_inference_timesteps: 4
51
+ noise_beta_alpha: 1.5
52
+ noise_beta_beta: 1.0
53
+ noise_s: 0.999
54
+ num_timestep_buckets: 1000
55
+ tune_projector: true
56
+ tune_diffusion_model: false
57
+ tune_vlln: false
58
+ state_dropout_prob: 0.0
59
+ state_additive_noise_scale: 0.0
60
+ max_num_embodiments: 32
61
+ data:
62
+ datasets:
63
+ - dataset_paths:
64
+ - /workspace/gr00t/examples/LIBERO/libero_goal_no_noops_1.0.0_lerobot
65
+ embodiment_tag: libero_panda
66
+ mix_ratio: 1.0
67
+ dataset_type: physical_embodiment
68
+ val_dataset_path: null
69
+ video_backend: torchcodec
70
+ - dataset_paths:
71
+ - /workspace/gr00t/examples/LIBERO/libero_10_no_noops_1.0.0_lerobot
72
+ embodiment_tag: libero_panda
73
+ mix_ratio: 1.0
74
+ dataset_type: physical_embodiment
75
+ val_dataset_path: null
76
+ video_backend: torchcodec
77
+ - dataset_paths:
78
+ - /workspace/gr00t/examples/LIBERO/libero_object_no_noops_1.0.0_lerobot
79
+ embodiment_tag: libero_panda
80
+ mix_ratio: 1.0
81
+ dataset_type: physical_embodiment
82
+ val_dataset_path: null
83
+ video_backend: torchcodec
84
+ - dataset_paths:
85
+ - /workspace/gr00t/examples/LIBERO/libero_spatial_no_noops_1.0.0_lerobot
86
+ embodiment_tag: libero_panda
87
+ mix_ratio: 1.0
88
+ dataset_type: physical_embodiment
89
+ val_dataset_path: null
90
+ video_backend: torchcodec
91
+ - dataset_paths:
92
+ - /workspace/gr00t/examples/SimplerEnv/bridge_orig_lerobot/
93
+ embodiment_tag: oxe_widowx
94
+ mix_ratio: 1.0
95
+ dataset_type: physical_embodiment
96
+ val_dataset_path: null
97
+ video_backend: torchcodec
98
+ - dataset_paths:
99
+ - /workspace/gr00t/examples/SimplerEnv/fractal20220817_data_lerobot/
100
+ embodiment_tag: oxe_google
101
+ mix_ratio: 1.0
102
+ dataset_type: physical_embodiment
103
+ val_dataset_path: null
104
+ video_backend: torchcodec
105
+ - dataset_paths:
106
+ - /workspace/gr00t/examples/GR00T-WholeBodyControl/PhysicalAI-Robotics-GR00T-X-Embodiment-Sim/unitree_g1.LMPnPAppleToPlateDC/
107
+ embodiment_tag: unitree_g1
108
+ mix_ratio: 1.0
109
+ dataset_type: physical_embodiment
110
+ val_dataset_path: null
111
+ video_backend: torchcodec
112
+ modality_configs:
113
+ oxe_google:
114
+ video:
115
+ delta_indices:
116
+ - 0
117
+ modality_keys:
118
+ - image
119
+ sin_cos_embedding_keys: null
120
+ mean_std_embedding_keys: null
121
+ action_configs: null
122
+ state:
123
+ delta_indices:
124
+ - 0
125
+ modality_keys:
126
+ - x
127
+ - 'y'
128
+ - z
129
+ - rx
130
+ - ry
131
+ - rz
132
+ - rw
133
+ - gripper
134
+ sin_cos_embedding_keys: null
135
+ mean_std_embedding_keys: null
136
+ action_configs: null
137
+ action:
138
+ delta_indices:
139
+ - 0
140
+ - 1
141
+ - 2
142
+ - 3
143
+ - 4
144
+ - 5
145
+ - 6
146
+ - 7
147
+ modality_keys:
148
+ - x
149
+ - 'y'
150
+ - z
151
+ - roll
152
+ - pitch
153
+ - yaw
154
+ - gripper
155
+ sin_cos_embedding_keys: null
156
+ mean_std_embedding_keys:
157
+ - x
158
+ - 'y'
159
+ - z
160
+ - roll
161
+ - pitch
162
+ - yaw
163
+ action_configs:
164
+ - rep: ABSOLUTE
165
+ type: NON_EEF
166
+ format: DEFAULT
167
+ state_key: null
168
+ - rep: ABSOLUTE
169
+ type: NON_EEF
170
+ format: DEFAULT
171
+ state_key: null
172
+ - rep: ABSOLUTE
173
+ type: NON_EEF
174
+ format: DEFAULT
175
+ state_key: null
176
+ - rep: ABSOLUTE
177
+ type: NON_EEF
178
+ format: DEFAULT
179
+ state_key: null
180
+ - rep: ABSOLUTE
181
+ type: NON_EEF
182
+ format: DEFAULT
183
+ state_key: null
184
+ - rep: ABSOLUTE
185
+ type: NON_EEF
186
+ format: DEFAULT
187
+ state_key: null
188
+ - rep: ABSOLUTE
189
+ type: NON_EEF
190
+ format: DEFAULT
191
+ state_key: null
192
+ language:
193
+ delta_indices:
194
+ - 0
195
+ modality_keys:
196
+ - annotation.human.action.task_description
197
+ sin_cos_embedding_keys: null
198
+ mean_std_embedding_keys: null
199
+ action_configs: null
200
+ unitree_g1:
201
+ video:
202
+ delta_indices:
203
+ - 0
204
+ modality_keys:
205
+ - ego_view
206
+ sin_cos_embedding_keys: null
207
+ mean_std_embedding_keys: null
208
+ action_configs: null
209
+ state:
210
+ delta_indices:
211
+ - 0
212
+ modality_keys:
213
+ - left_leg
214
+ - right_leg
215
+ - waist
216
+ - left_arm
217
+ - right_arm
218
+ - left_hand
219
+ - right_hand
220
+ sin_cos_embedding_keys: null
221
+ mean_std_embedding_keys: null
222
+ action_configs: null
223
+ action:
224
+ delta_indices:
225
+ - 0
226
+ - 1
227
+ - 2
228
+ - 3
229
+ - 4
230
+ - 5
231
+ - 6
232
+ - 7
233
+ - 8
234
+ - 9
235
+ - 10
236
+ - 11
237
+ - 12
238
+ - 13
239
+ - 14
240
+ - 15
241
+ - 16
242
+ - 17
243
+ - 18
244
+ - 19
245
+ - 20
246
+ - 21
247
+ - 22
248
+ - 23
249
+ - 24
250
+ - 25
251
+ - 26
252
+ - 27
253
+ - 28
254
+ - 29
255
+ modality_keys:
256
+ - left_arm
257
+ - right_arm
258
+ - left_hand
259
+ - right_hand
260
+ - waist
261
+ - base_height_command
262
+ - navigate_command
263
+ sin_cos_embedding_keys: null
264
+ mean_std_embedding_keys: null
265
+ action_configs:
266
+ - rep: RELATIVE
267
+ type: NON_EEF
268
+ format: DEFAULT
269
+ state_key: null
270
+ - rep: RELATIVE
271
+ type: NON_EEF
272
+ format: DEFAULT
273
+ state_key: null
274
+ - rep: ABSOLUTE
275
+ type: NON_EEF
276
+ format: DEFAULT
277
+ state_key: null
278
+ - rep: ABSOLUTE
279
+ type: NON_EEF
280
+ format: DEFAULT
281
+ state_key: null
282
+ - rep: ABSOLUTE
283
+ type: NON_EEF
284
+ format: DEFAULT
285
+ state_key: null
286
+ - rep: ABSOLUTE
287
+ type: NON_EEF
288
+ format: DEFAULT
289
+ state_key: null
290
+ - rep: ABSOLUTE
291
+ type: NON_EEF
292
+ format: DEFAULT
293
+ state_key: null
294
+ language:
295
+ delta_indices:
296
+ - 0
297
+ modality_keys:
298
+ - annotation.human.task_description
299
+ sin_cos_embedding_keys: null
300
+ mean_std_embedding_keys: null
301
+ action_configs: null
302
+ oxe_widowx:
303
+ video:
304
+ delta_indices:
305
+ - 0
306
+ modality_keys:
307
+ - image_0
308
+ sin_cos_embedding_keys: null
309
+ mean_std_embedding_keys: null
310
+ action_configs: null
311
+ state:
312
+ delta_indices:
313
+ - 0
314
+ modality_keys:
315
+ - x
316
+ - 'y'
317
+ - z
318
+ - roll
319
+ - pitch
320
+ - yaw
321
+ - pad
322
+ - gripper
323
+ sin_cos_embedding_keys: null
324
+ mean_std_embedding_keys: null
325
+ action_configs: null
326
+ action:
327
+ delta_indices:
328
+ - 0
329
+ - 1
330
+ - 2
331
+ - 3
332
+ - 4
333
+ - 5
334
+ - 6
335
+ - 7
336
+ modality_keys:
337
+ - x
338
+ - 'y'
339
+ - z
340
+ - roll
341
+ - pitch
342
+ - yaw
343
+ - gripper
344
+ sin_cos_embedding_keys: null
345
+ mean_std_embedding_keys:
346
+ - x
347
+ - 'y'
348
+ - z
349
+ - roll
350
+ - pitch
351
+ - yaw
352
+ action_configs:
353
+ - rep: ABSOLUTE
354
+ type: NON_EEF
355
+ format: DEFAULT
356
+ state_key: null
357
+ - rep: ABSOLUTE
358
+ type: NON_EEF
359
+ format: DEFAULT
360
+ state_key: null
361
+ - rep: ABSOLUTE
362
+ type: NON_EEF
363
+ format: DEFAULT
364
+ state_key: null
365
+ - rep: ABSOLUTE
366
+ type: NON_EEF
367
+ format: DEFAULT
368
+ state_key: null
369
+ - rep: ABSOLUTE
370
+ type: NON_EEF
371
+ format: DEFAULT
372
+ state_key: null
373
+ - rep: ABSOLUTE
374
+ type: NON_EEF
375
+ format: DEFAULT
376
+ state_key: null
377
+ - rep: ABSOLUTE
378
+ type: NON_EEF
379
+ format: DEFAULT
380
+ state_key: null
381
+ language:
382
+ delta_indices:
383
+ - 0
384
+ modality_keys:
385
+ - annotation.human.action.task_description
386
+ sin_cos_embedding_keys: null
387
+ mean_std_embedding_keys: null
388
+ action_configs: null
389
+ libero_panda:
390
+ video:
391
+ delta_indices:
392
+ - 0
393
+ modality_keys:
394
+ - image
395
+ - wrist_image
396
+ sin_cos_embedding_keys: null
397
+ mean_std_embedding_keys: null
398
+ action_configs: null
399
+ state:
400
+ delta_indices:
401
+ - 0
402
+ modality_keys:
403
+ - x
404
+ - 'y'
405
+ - z
406
+ - roll
407
+ - pitch
408
+ - yaw
409
+ - gripper
410
+ sin_cos_embedding_keys: null
411
+ mean_std_embedding_keys: null
412
+ action_configs: null
413
+ action:
414
+ delta_indices:
415
+ - 0
416
+ - 1
417
+ - 2
418
+ - 3
419
+ - 4
420
+ - 5
421
+ - 6
422
+ - 7
423
+ - 8
424
+ - 9
425
+ - 10
426
+ - 11
427
+ - 12
428
+ - 13
429
+ - 14
430
+ - 15
431
+ modality_keys:
432
+ - x
433
+ - 'y'
434
+ - z
435
+ - roll
436
+ - pitch
437
+ - yaw
438
+ - gripper
439
+ sin_cos_embedding_keys: null
440
+ mean_std_embedding_keys: null
441
+ action_configs:
442
+ - rep: ABSOLUTE
443
+ type: NON_EEF
444
+ format: DEFAULT
445
+ state_key: null
446
+ - rep: ABSOLUTE
447
+ type: NON_EEF
448
+ format: DEFAULT
449
+ state_key: null
450
+ - rep: ABSOLUTE
451
+ type: NON_EEF
452
+ format: DEFAULT
453
+ state_key: null
454
+ - rep: ABSOLUTE
455
+ type: NON_EEF
456
+ format: DEFAULT
457
+ state_key: null
458
+ - rep: ABSOLUTE
459
+ type: NON_EEF
460
+ format: DEFAULT
461
+ state_key: null
462
+ - rep: ABSOLUTE
463
+ type: NON_EEF
464
+ format: DEFAULT
465
+ state_key: null
466
+ - rep: ABSOLUTE
467
+ type: NON_EEF
468
+ format: DEFAULT
469
+ state_key: null
470
+ language:
471
+ delta_indices:
472
+ - 0
473
+ modality_keys:
474
+ - annotation.human.action.task_description
475
+ sin_cos_embedding_keys: null
476
+ mean_std_embedding_keys: null
477
+ action_configs: null
478
+ download_cache: false
479
+ shard_size: 1024
480
+ episode_sampling_rate: 0.1
481
+ num_shards_per_epoch: 100000
482
+ override_pretraining_statistics: false
483
+ mode: scenario
484
+ random_chop: 0.0
485
+ mock_dataset_mode: false
486
+ shuffle: true
487
+ seed: 42
488
+ multiprocessing_context: fork
489
+ allow_padding: false
490
+ subsample_ratio: 1.0
491
+ image_crop_size:
492
+ - 244
493
+ - 244
494
+ image_target_size:
495
+ - 224
496
+ - 224
497
+ video_backend: torchcodec
498
+ training:
499
+ output_dir: models/attempt_2/stage_1_2/
500
+ experiment_name: null
501
+ max_steps: 20000
502
+ global_batch_size: 12
503
+ batch_size: null
504
+ gradient_accumulation_steps: 1
505
+ learning_rate: 0.0001
506
+ lr_scheduler_type: cosine
507
+ weight_decay: 1.0e-05
508
+ warmup_ratio: 0.05
509
+ warmup_steps: 0
510
+ max_grad_norm: 1.0
511
+ optim: adamw_torch_fused
512
+ start_from_checkpoint: models/attempt_2/stage_1/checkpoint-12000/
513
+ tf32: true
514
+ fp16: false
515
+ bf16: true
516
+ eval_bf16: true
517
+ logging_steps: 10
518
+ save_steps: 2000
519
+ save_total_limit: 5
520
+ save_vl_model: false
521
+ upload_checkpoints: false
522
+ upload_every: 1000
523
+ upload_last_n_checkpoints: 5
524
+ max_concurrent_uploads: 2
525
+ eval_strategy: 'no'
526
+ eval_steps: 500
527
+ eval_set_split_ratio: 0.1
528
+ eval_batch_size: 2
529
+ save_best_eval_metric_name: ''
530
+ save_best_eval_metric_greater_is_better: true
531
+ deepspeed_stage: 2
532
+ gradient_checkpointing: false
533
+ transformers_trust_remote_code: true
534
+ transformers_local_files_only: false
535
+ transformers_cache_dir: null
536
+ transformers_access_token: null
537
+ use_ddp: false
538
+ ddp_bucket_cap_mb: 100
539
+ num_gpus: 1
540
+ dataloader_num_workers: 9
541
+ remove_unused_columns: false
542
+ use_wandb: true
543
+ wandb_project: finetune-gr00t-n1d6
544
+ enable_profiling: false
545
+ max_retries: 3
546
+ assert_loss_less_than: null
547
+ add_rl_callback: false
548
+ enable_open_loop_eval: false
549
+ open_loop_eval_traj_ids:
550
+ - 0
551
+ open_loop_eval_steps_per_traj: 100
552
+ open_loop_eval_plot_indices: null
553
+ scenario: gr00t.experiment_configs.stage_1.Stage1Dataset
554
+ experiment: gr00t.experiment_configs.stage_1.Stage1Experiment
555
+ max_steps: 20000
556
+ save_steps: 2000
checkpoint-20000/experiment_cfg/config.yaml ADDED
@@ -0,0 +1,569 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !!python/object:gr00t.configs.base_config.MemoryExperimentConfig
2
+ data: !!python/object:gr00t.configs.data.data_config.DataConfig
3
+ allow_padding: false
4
+ datasets:
5
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
6
+ dataset_paths:
7
+ - /workspace/gr00t/examples/LIBERO/libero_goal_no_noops_1.0.0_lerobot
8
+ dataset_type: physical_embodiment
9
+ embodiment_tag: libero_panda
10
+ mix_ratio: 1.0
11
+ val_dataset_path: null
12
+ video_backend: torchcodec
13
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
14
+ dataset_paths:
15
+ - /workspace/gr00t/examples/LIBERO/libero_10_no_noops_1.0.0_lerobot
16
+ dataset_type: physical_embodiment
17
+ embodiment_tag: libero_panda
18
+ mix_ratio: 1.0
19
+ val_dataset_path: null
20
+ video_backend: torchcodec
21
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
22
+ dataset_paths:
23
+ - /workspace/gr00t/examples/LIBERO/libero_object_no_noops_1.0.0_lerobot
24
+ dataset_type: physical_embodiment
25
+ embodiment_tag: libero_panda
26
+ mix_ratio: 1.0
27
+ val_dataset_path: null
28
+ video_backend: torchcodec
29
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
30
+ dataset_paths:
31
+ - /workspace/gr00t/examples/LIBERO/libero_spatial_no_noops_1.0.0_lerobot
32
+ dataset_type: physical_embodiment
33
+ embodiment_tag: libero_panda
34
+ mix_ratio: 1.0
35
+ val_dataset_path: null
36
+ video_backend: torchcodec
37
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
38
+ dataset_paths:
39
+ - /workspace/gr00t/examples/SimplerEnv/bridge_orig_lerobot/
40
+ dataset_type: physical_embodiment
41
+ embodiment_tag: oxe_widowx
42
+ mix_ratio: 1.0
43
+ val_dataset_path: null
44
+ video_backend: torchcodec
45
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
46
+ dataset_paths:
47
+ - /workspace/gr00t/examples/SimplerEnv/fractal20220817_data_lerobot/
48
+ dataset_type: physical_embodiment
49
+ embodiment_tag: oxe_google
50
+ mix_ratio: 1.0
51
+ val_dataset_path: null
52
+ video_backend: torchcodec
53
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
54
+ dataset_paths:
55
+ - /workspace/gr00t/examples/GR00T-WholeBodyControl/PhysicalAI-Robotics-GR00T-X-Embodiment-Sim/unitree_g1.LMPnPAppleToPlateDC/
56
+ dataset_type: physical_embodiment
57
+ embodiment_tag: unitree_g1
58
+ mix_ratio: 1.0
59
+ val_dataset_path: null
60
+ video_backend: torchcodec
61
+ download_cache: false
62
+ episode_sampling_rate: 0.1
63
+ image_crop_size:
64
+ - 244
65
+ - 244
66
+ image_target_size:
67
+ - 224
68
+ - 224
69
+ mock_dataset_mode: false
70
+ modality_configs:
71
+ libero_panda:
72
+ action: !!python/object:gr00t.data.types.ModalityConfig
73
+ action_configs:
74
+ - &id001 !!python/object:gr00t.data.types.ActionConfig
75
+ format: &id002 !!python/object/apply:gr00t.data.types.ActionFormat
76
+ - default
77
+ rep: &id003 !!python/object/apply:gr00t.data.types.ActionRepresentation
78
+ - absolute
79
+ state_key: null
80
+ type: &id004 !!python/object/apply:gr00t.data.types.ActionType
81
+ - non_eef
82
+ - *id001
83
+ - *id001
84
+ - *id001
85
+ - *id001
86
+ - *id001
87
+ - *id001
88
+ delta_indices:
89
+ - 0
90
+ - 1
91
+ - 2
92
+ - 3
93
+ - 4
94
+ - 5
95
+ - 6
96
+ - 7
97
+ - 8
98
+ - 9
99
+ - 10
100
+ - 11
101
+ - 12
102
+ - 13
103
+ - 14
104
+ - 15
105
+ mean_std_embedding_keys: null
106
+ modality_keys:
107
+ - x
108
+ - y
109
+ - z
110
+ - roll
111
+ - pitch
112
+ - yaw
113
+ - gripper
114
+ sin_cos_embedding_keys: null
115
+ language: !!python/object:gr00t.data.types.ModalityConfig
116
+ action_configs: null
117
+ delta_indices:
118
+ - 0
119
+ mean_std_embedding_keys: null
120
+ modality_keys:
121
+ - annotation.human.action.task_description
122
+ sin_cos_embedding_keys: null
123
+ state: !!python/object:gr00t.data.types.ModalityConfig
124
+ action_configs: null
125
+ delta_indices:
126
+ - 0
127
+ mean_std_embedding_keys: null
128
+ modality_keys:
129
+ - x
130
+ - y
131
+ - z
132
+ - roll
133
+ - pitch
134
+ - yaw
135
+ - gripper
136
+ sin_cos_embedding_keys: null
137
+ video: !!python/object:gr00t.data.types.ModalityConfig
138
+ action_configs: null
139
+ delta_indices:
140
+ - 0
141
+ mean_std_embedding_keys: null
142
+ modality_keys:
143
+ - image
144
+ - wrist_image
145
+ sin_cos_embedding_keys: null
146
+ oxe_google:
147
+ action: !!python/object:gr00t.data.types.ModalityConfig
148
+ action_configs:
149
+ - &id005 !!python/object:gr00t.data.types.ActionConfig
150
+ format: *id002
151
+ rep: *id003
152
+ state_key: null
153
+ type: *id004
154
+ - *id005
155
+ - *id005
156
+ - *id005
157
+ - *id005
158
+ - *id005
159
+ - *id005
160
+ delta_indices:
161
+ - 0
162
+ - 1
163
+ - 2
164
+ - 3
165
+ - 4
166
+ - 5
167
+ - 6
168
+ - 7
169
+ mean_std_embedding_keys:
170
+ - x
171
+ - y
172
+ - z
173
+ - roll
174
+ - pitch
175
+ - yaw
176
+ modality_keys:
177
+ - x
178
+ - y
179
+ - z
180
+ - roll
181
+ - pitch
182
+ - yaw
183
+ - gripper
184
+ sin_cos_embedding_keys: null
185
+ language: !!python/object:gr00t.data.types.ModalityConfig
186
+ action_configs: null
187
+ delta_indices:
188
+ - 0
189
+ mean_std_embedding_keys: null
190
+ modality_keys:
191
+ - annotation.human.action.task_description
192
+ sin_cos_embedding_keys: null
193
+ state: !!python/object:gr00t.data.types.ModalityConfig
194
+ action_configs: null
195
+ delta_indices:
196
+ - 0
197
+ mean_std_embedding_keys: null
198
+ modality_keys:
199
+ - x
200
+ - y
201
+ - z
202
+ - rx
203
+ - ry
204
+ - rz
205
+ - rw
206
+ - gripper
207
+ sin_cos_embedding_keys: null
208
+ video: !!python/object:gr00t.data.types.ModalityConfig
209
+ action_configs: null
210
+ delta_indices:
211
+ - 0
212
+ mean_std_embedding_keys: null
213
+ modality_keys:
214
+ - image
215
+ sin_cos_embedding_keys: null
216
+ oxe_widowx:
217
+ action: !!python/object:gr00t.data.types.ModalityConfig
218
+ action_configs:
219
+ - &id006 !!python/object:gr00t.data.types.ActionConfig
220
+ format: *id002
221
+ rep: *id003
222
+ state_key: null
223
+ type: *id004
224
+ - *id006
225
+ - *id006
226
+ - *id006
227
+ - *id006
228
+ - *id006
229
+ - *id006
230
+ delta_indices:
231
+ - 0
232
+ - 1
233
+ - 2
234
+ - 3
235
+ - 4
236
+ - 5
237
+ - 6
238
+ - 7
239
+ mean_std_embedding_keys:
240
+ - x
241
+ - y
242
+ - z
243
+ - roll
244
+ - pitch
245
+ - yaw
246
+ modality_keys:
247
+ - x
248
+ - y
249
+ - z
250
+ - roll
251
+ - pitch
252
+ - yaw
253
+ - gripper
254
+ sin_cos_embedding_keys: null
255
+ language: !!python/object:gr00t.data.types.ModalityConfig
256
+ action_configs: null
257
+ delta_indices:
258
+ - 0
259
+ mean_std_embedding_keys: null
260
+ modality_keys:
261
+ - annotation.human.action.task_description
262
+ sin_cos_embedding_keys: null
263
+ state: !!python/object:gr00t.data.types.ModalityConfig
264
+ action_configs: null
265
+ delta_indices:
266
+ - 0
267
+ mean_std_embedding_keys: null
268
+ modality_keys:
269
+ - x
270
+ - y
271
+ - z
272
+ - roll
273
+ - pitch
274
+ - yaw
275
+ - pad
276
+ - gripper
277
+ sin_cos_embedding_keys: null
278
+ video: !!python/object:gr00t.data.types.ModalityConfig
279
+ action_configs: null
280
+ delta_indices:
281
+ - 0
282
+ mean_std_embedding_keys: null
283
+ modality_keys:
284
+ - image_0
285
+ sin_cos_embedding_keys: null
286
+ unitree_g1:
287
+ action: !!python/object:gr00t.data.types.ModalityConfig
288
+ action_configs:
289
+ - !!python/object:gr00t.data.types.ActionConfig
290
+ format: *id002
291
+ rep: &id007 !!python/object/apply:gr00t.data.types.ActionRepresentation
292
+ - relative
293
+ state_key: null
294
+ type: *id004
295
+ - !!python/object:gr00t.data.types.ActionConfig
296
+ format: *id002
297
+ rep: *id007
298
+ state_key: null
299
+ type: *id004
300
+ - !!python/object:gr00t.data.types.ActionConfig
301
+ format: *id002
302
+ rep: *id003
303
+ state_key: null
304
+ type: *id004
305
+ - !!python/object:gr00t.data.types.ActionConfig
306
+ format: *id002
307
+ rep: *id003
308
+ state_key: null
309
+ type: *id004
310
+ - !!python/object:gr00t.data.types.ActionConfig
311
+ format: *id002
312
+ rep: *id003
313
+ state_key: null
314
+ type: *id004
315
+ - !!python/object:gr00t.data.types.ActionConfig
316
+ format: *id002
317
+ rep: *id003
318
+ state_key: null
319
+ type: *id004
320
+ - !!python/object:gr00t.data.types.ActionConfig
321
+ format: *id002
322
+ rep: *id003
323
+ state_key: null
324
+ type: *id004
325
+ delta_indices:
326
+ - 0
327
+ - 1
328
+ - 2
329
+ - 3
330
+ - 4
331
+ - 5
332
+ - 6
333
+ - 7
334
+ - 8
335
+ - 9
336
+ - 10
337
+ - 11
338
+ - 12
339
+ - 13
340
+ - 14
341
+ - 15
342
+ - 16
343
+ - 17
344
+ - 18
345
+ - 19
346
+ - 20
347
+ - 21
348
+ - 22
349
+ - 23
350
+ - 24
351
+ - 25
352
+ - 26
353
+ - 27
354
+ - 28
355
+ - 29
356
+ mean_std_embedding_keys: null
357
+ modality_keys:
358
+ - left_arm
359
+ - right_arm
360
+ - left_hand
361
+ - right_hand
362
+ - waist
363
+ - base_height_command
364
+ - navigate_command
365
+ sin_cos_embedding_keys: null
366
+ language: !!python/object:gr00t.data.types.ModalityConfig
367
+ action_configs: null
368
+ delta_indices:
369
+ - 0
370
+ mean_std_embedding_keys: null
371
+ modality_keys:
372
+ - annotation.human.task_description
373
+ sin_cos_embedding_keys: null
374
+ state: !!python/object:gr00t.data.types.ModalityConfig
375
+ action_configs: null
376
+ delta_indices:
377
+ - 0
378
+ mean_std_embedding_keys: null
379
+ modality_keys:
380
+ - left_leg
381
+ - right_leg
382
+ - waist
383
+ - left_arm
384
+ - right_arm
385
+ - left_hand
386
+ - right_hand
387
+ sin_cos_embedding_keys: null
388
+ video: !!python/object:gr00t.data.types.ModalityConfig
389
+ action_configs: null
390
+ delta_indices:
391
+ - 0
392
+ mean_std_embedding_keys: null
393
+ modality_keys:
394
+ - ego_view
395
+ sin_cos_embedding_keys: null
396
+ mode: scenario
397
+ multiprocessing_context: fork
398
+ num_shards_per_epoch: 100000
399
+ override_pretraining_statistics: false
400
+ random_chop: 0.0
401
+ seed: 42
402
+ shard_size: 1024
403
+ shuffle: true
404
+ subsample_ratio: 1.0
405
+ video_backend: torchcodec
406
+ experiment: gr00t.experiment_configs.stage_1.Stage1Experiment
407
+ load_config_path: null
408
+ model: !!python/object:gr00t.configs.model.gr00t_n1d6.MemoryGr00tN1d6Config
409
+ _attn_implementation_autoset: false
410
+ _attn_implementation_internal: null
411
+ _commit_hash: null
412
+ _name_or_path: ''
413
+ _output_attentions: false
414
+ add_bos_embedding: true
415
+ add_cross_attention: false
416
+ architectures: null
417
+ backbone_model_type: eagle
418
+ backbone_trainable_params_fp32: true
419
+ bad_words_ids: null
420
+ begin_suppress_tokens: null
421
+ block_size: 256
422
+ bos_token_id: null
423
+ chunk_size_feed_forward: 0
424
+ color_jitter_params: null
425
+ cross_attention_hidden_size: null
426
+ decay_frequency: 1
427
+ decoder_start_token_id: null
428
+ diffusion_model_cfg:
429
+ attention_head_dim: 48
430
+ dropout: 0.2
431
+ final_dropout: true
432
+ interleave_self_attention: true
433
+ norm_type: ada_norm
434
+ num_attention_heads: 32
435
+ num_layers: 32
436
+ output_dim: 1024
437
+ positional_embeddings: null
438
+ diversity_penalty: 0.0
439
+ do_sample: false
440
+ drop_ltm_memory: true
441
+ eagle_collator: true
442
+ early_stopping: false
443
+ encoder_no_repeat_ngram_size: 0
444
+ eos_token_id: null
445
+ exponential_decay_length_penalty: null
446
+ finetuning_task: null
447
+ forced_bos_token_id: null
448
+ forced_eos_token_id: null
449
+ id2label:
450
+ 0: LABEL_0
451
+ 1: LABEL_1
452
+ initial_recall_frequency: 10
453
+ is_decoder: false
454
+ is_encoder_decoder: false
455
+ label2id:
456
+ LABEL_0: 0
457
+ LABEL_1: 1
458
+ length_penalty: 1.0
459
+ load_bf16: false
460
+ ltm_retrieve_blocks: 10
461
+ max_length: 20
462
+ min_length: 0
463
+ model_name: nvidia/Eagle-Block2A-2B-v2
464
+ no_repeat_ngram_size: 0
465
+ num_beam_groups: 1
466
+ num_beams: 1
467
+ num_ltm_blocks: 10
468
+ num_return_sequences: 1
469
+ num_selector_layers: 2
470
+ num_total_blocks: 50
471
+ output_hidden_states: false
472
+ output_scores: false
473
+ pad_token_id: null
474
+ prefix: null
475
+ problem_type: null
476
+ pruned_heads: {}
477
+ random_retriever_length: false
478
+ random_rotation_angle: null
479
+ remove_invalid_values: false
480
+ repetition_penalty: 1.0
481
+ reproject_vision: false
482
+ return_dict: true
483
+ return_dict_in_generate: false
484
+ selector_hidden_dim: 256
485
+ sep_token_id: null
486
+ state_dropout_prob: 0.0
487
+ suppress_tokens: null
488
+ task_specific_params: null
489
+ temperature: 1.0
490
+ tf_legacy_loss: false
491
+ tie_encoder_decoder: false
492
+ tie_word_embeddings: true
493
+ tokenizer_class: null
494
+ top_k: 50
495
+ top_p: 1.0
496
+ torch_dtype: null
497
+ torchscript: false
498
+ transformers_version: null
499
+ tune_diffusion_model: false
500
+ tune_enc_query_proj: false
501
+ tune_key_proj: false
502
+ tune_llm: false
503
+ tune_memory_bos_embedding: false
504
+ tune_memory_loras: false
505
+ tune_memory_pos_embeddings: false
506
+ tune_projector: true
507
+ tune_query_proj: false
508
+ tune_top_llm_layers: 0
509
+ tune_visual: false
510
+ tune_vlln: false
511
+ typical_p: 1.0
512
+ use_bfloat16: false
513
+ use_relative_action: true
514
+ scenario: gr00t.experiment_configs.stage_1.Stage1Dataset
515
+ training: !!python/object:gr00t.configs.training.training_config.TrainingConfig
516
+ add_rl_callback: false
517
+ assert_loss_less_than: null
518
+ batch_size: null
519
+ bf16: true
520
+ dataloader_num_workers: 9
521
+ ddp_bucket_cap_mb: 100
522
+ deepspeed_stage: 2
523
+ enable_open_loop_eval: false
524
+ enable_profiling: false
525
+ eval_batch_size: 2
526
+ eval_bf16: true
527
+ eval_set_split_ratio: 0.1
528
+ eval_steps: 500
529
+ eval_strategy: 'no'
530
+ experiment_name: null
531
+ fp16: false
532
+ global_batch_size: 12
533
+ gradient_accumulation_steps: 1
534
+ gradient_checkpointing: false
535
+ learning_rate: 0.0001
536
+ logging_steps: 10
537
+ lr_scheduler_type: cosine
538
+ max_concurrent_uploads: 2
539
+ max_grad_norm: 1.0
540
+ max_retries: 3
541
+ max_steps: 20000
542
+ num_gpus: 1
543
+ open_loop_eval_plot_indices: null
544
+ open_loop_eval_steps_per_traj: 100
545
+ open_loop_eval_traj_ids:
546
+ - 0
547
+ optim: adamw_torch_fused
548
+ output_dir: models/attempt_2/stage_1_2/
549
+ remove_unused_columns: false
550
+ save_best_eval_metric_greater_is_better: true
551
+ save_best_eval_metric_name: ''
552
+ save_steps: 2000
553
+ save_total_limit: 5
554
+ save_vl_model: false
555
+ start_from_checkpoint: models/attempt_2/stage_1/checkpoint-12000/
556
+ tf32: true
557
+ transformers_access_token: null
558
+ transformers_cache_dir: null
559
+ transformers_local_files_only: false
560
+ transformers_trust_remote_code: true
561
+ upload_checkpoints: false
562
+ upload_every: 1000
563
+ upload_last_n_checkpoints: 5
564
+ use_ddp: false
565
+ use_wandb: true
566
+ wandb_project: finetune-gr00t-n1d6
567
+ warmup_ratio: 0.05
568
+ warmup_steps: 0
569
+ weight_decay: 1.0e-05
checkpoint-20000/experiment_cfg/dataset_statistics.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-20000/experiment_cfg/final_model_config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "Gr00tN1d6",
3
+ "model_dtype": "bfloat16",
4
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
5
+ "backbone_model_type": "eagle",
6
+ "model_revision": null,
7
+ "tune_top_llm_layers": 4,
8
+ "backbone_embedding_dim": 2048,
9
+ "tune_llm": false,
10
+ "tune_visual": false,
11
+ "select_layer": 16,
12
+ "reproject_vision": false,
13
+ "use_flash_attention": true,
14
+ "load_bf16": true,
15
+ "collator_overwrite_image_inputs": false,
16
+ "eagle_collator": true,
17
+ "backbone_trainable_params_fp32": true,
18
+ "apply_sincos_state_encoding": true,
19
+ "use_relative_action": true,
20
+ "max_state_dim": 128,
21
+ "max_action_dim": 128,
22
+ "action_horizon": 50,
23
+ "hidden_size": 1024,
24
+ "input_embedding_dim": 1536,
25
+ "add_pos_embed": true,
26
+ "attn_dropout": 0.2,
27
+ "use_vlln": true,
28
+ "max_seq_len": 1024,
29
+ "use_alternate_vl_dit": true,
30
+ "attend_text_every_n_blocks": 2,
31
+ "diffusion_model_cfg": {
32
+ "attention_head_dim": 48,
33
+ "dropout": 0.2,
34
+ "final_dropout": true,
35
+ "interleave_self_attention": true,
36
+ "norm_type": "ada_norm",
37
+ "num_attention_heads": 32,
38
+ "num_layers": 32,
39
+ "output_dim": 1024,
40
+ "positional_embeddings": null
41
+ },
42
+ "num_inference_timesteps": 4,
43
+ "noise_beta_alpha": 1.5,
44
+ "noise_beta_beta": 1.0,
45
+ "noise_s": 0.999,
46
+ "num_timestep_buckets": 1000,
47
+ "tune_projector": true,
48
+ "tune_diffusion_model": false,
49
+ "tune_vlln": false,
50
+ "state_dropout_prob": 0.0,
51
+ "state_additive_noise_scale": 0.0,
52
+ "max_num_embodiments": 32
53
+ }
checkpoint-20000/experiment_cfg/final_processor_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-20000/model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad263dbb9b0d0da51b35ecf98514bfa31f9bd4ce63b575600f9a552a4974493f
3
+ size 4992611704
checkpoint-20000/model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cf480f5bf73530087d71890cf7a052f36cc564b771f327892967202d19a6bdf
3
+ size 4385706632
checkpoint-20000/model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-20000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:046a73eeb58c8d1eb72a6d00c6549768e6099f2ab2f39ef13f52fff0c3238294
3
+ size 2615195266
checkpoint-20000/processor_config.json ADDED
@@ -0,0 +1,857 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "Gr00tN1d6ScenarioProcessor",
3
+ "processor_kwargs": {
4
+ "modality_configs": {
5
+ "behavior_r1_pro": {
6
+ "video": {
7
+ "delta_indices": [
8
+ 0
9
+ ],
10
+ "modality_keys": [
11
+ "observation.images.rgb.head_256_256",
12
+ "observation.images.rgb.left_wrist_256_256",
13
+ "observation.images.rgb.right_wrist_256_256"
14
+ ],
15
+ "sin_cos_embedding_keys": null,
16
+ "mean_std_embedding_keys": null,
17
+ "action_configs": null
18
+ },
19
+ "state": {
20
+ "delta_indices": [
21
+ 0
22
+ ],
23
+ "modality_keys": [
24
+ "robot_pos",
25
+ "robot_ori_cos",
26
+ "robot_ori_sin",
27
+ "robot_2d_ori",
28
+ "robot_2d_ori_cos",
29
+ "robot_2d_ori_sin",
30
+ "robot_lin_vel",
31
+ "robot_ang_vel",
32
+ "arm_left_qpos",
33
+ "arm_left_qpos_sin",
34
+ "arm_left_qpos_cos",
35
+ "eef_left_pos",
36
+ "eef_left_quat",
37
+ "gripper_left_qpos",
38
+ "arm_right_qpos",
39
+ "arm_right_qpos_sin",
40
+ "arm_right_qpos_cos",
41
+ "eef_right_pos",
42
+ "eef_right_quat",
43
+ "gripper_right_qpos",
44
+ "trunk_qpos"
45
+ ],
46
+ "sin_cos_embedding_keys": null,
47
+ "mean_std_embedding_keys": null,
48
+ "action_configs": null
49
+ },
50
+ "action": {
51
+ "delta_indices": [
52
+ 0,
53
+ 1,
54
+ 2,
55
+ 3,
56
+ 4,
57
+ 5,
58
+ 6,
59
+ 7,
60
+ 8,
61
+ 9,
62
+ 10,
63
+ 11,
64
+ 12,
65
+ 13,
66
+ 14,
67
+ 15,
68
+ 16,
69
+ 17,
70
+ 18,
71
+ 19,
72
+ 20,
73
+ 21,
74
+ 22,
75
+ 23,
76
+ 24,
77
+ 25,
78
+ 26,
79
+ 27,
80
+ 28,
81
+ 29,
82
+ 30,
83
+ 31
84
+ ],
85
+ "modality_keys": [
86
+ "base",
87
+ "torso",
88
+ "left_arm",
89
+ "left_gripper",
90
+ "right_arm",
91
+ "right_gripper"
92
+ ],
93
+ "sin_cos_embedding_keys": null,
94
+ "mean_std_embedding_keys": null,
95
+ "action_configs": [
96
+ {
97
+ "rep": "ABSOLUTE",
98
+ "type": "NON_EEF",
99
+ "format": "DEFAULT",
100
+ "state_key": null
101
+ },
102
+ {
103
+ "rep": "RELATIVE",
104
+ "type": "NON_EEF",
105
+ "format": "DEFAULT",
106
+ "state_key": "trunk_qpos"
107
+ },
108
+ {
109
+ "rep": "RELATIVE",
110
+ "type": "NON_EEF",
111
+ "format": "DEFAULT",
112
+ "state_key": "arm_left_qpos"
113
+ },
114
+ {
115
+ "rep": "ABSOLUTE",
116
+ "type": "NON_EEF",
117
+ "format": "DEFAULT",
118
+ "state_key": null
119
+ },
120
+ {
121
+ "rep": "RELATIVE",
122
+ "type": "NON_EEF",
123
+ "format": "DEFAULT",
124
+ "state_key": "arm_right_qpos"
125
+ },
126
+ {
127
+ "rep": "ABSOLUTE",
128
+ "type": "NON_EEF",
129
+ "format": "DEFAULT",
130
+ "state_key": null
131
+ }
132
+ ]
133
+ },
134
+ "language": {
135
+ "delta_indices": [
136
+ 0
137
+ ],
138
+ "modality_keys": [
139
+ "annotation.human.coarse_action"
140
+ ],
141
+ "sin_cos_embedding_keys": null,
142
+ "mean_std_embedding_keys": null,
143
+ "action_configs": null
144
+ }
145
+ },
146
+ "gr1": {
147
+ "video": {
148
+ "delta_indices": [
149
+ 0
150
+ ],
151
+ "modality_keys": [
152
+ "ego_view_bg_crop_pad_res256_freq20"
153
+ ],
154
+ "sin_cos_embedding_keys": null,
155
+ "mean_std_embedding_keys": null,
156
+ "action_configs": null
157
+ },
158
+ "state": {
159
+ "delta_indices": [
160
+ 0
161
+ ],
162
+ "modality_keys": [
163
+ "left_arm",
164
+ "right_arm",
165
+ "left_hand",
166
+ "right_hand",
167
+ "waist"
168
+ ],
169
+ "sin_cos_embedding_keys": [
170
+ "left_arm",
171
+ "right_arm",
172
+ "left_hand",
173
+ "right_hand",
174
+ "waist"
175
+ ],
176
+ "mean_std_embedding_keys": null,
177
+ "action_configs": null
178
+ },
179
+ "action": {
180
+ "delta_indices": [
181
+ 0,
182
+ 1,
183
+ 2,
184
+ 3,
185
+ 4,
186
+ 5,
187
+ 6,
188
+ 7,
189
+ 8,
190
+ 9,
191
+ 10,
192
+ 11,
193
+ 12,
194
+ 13,
195
+ 14,
196
+ 15
197
+ ],
198
+ "modality_keys": [
199
+ "left_arm",
200
+ "right_arm",
201
+ "left_hand",
202
+ "right_hand",
203
+ "waist"
204
+ ],
205
+ "sin_cos_embedding_keys": null,
206
+ "mean_std_embedding_keys": null,
207
+ "action_configs": [
208
+ {
209
+ "rep": "RELATIVE",
210
+ "type": "NON_EEF",
211
+ "format": "DEFAULT",
212
+ "state_key": null
213
+ },
214
+ {
215
+ "rep": "RELATIVE",
216
+ "type": "NON_EEF",
217
+ "format": "DEFAULT",
218
+ "state_key": null
219
+ },
220
+ {
221
+ "rep": "RELATIVE",
222
+ "type": "NON_EEF",
223
+ "format": "DEFAULT",
224
+ "state_key": null
225
+ },
226
+ {
227
+ "rep": "RELATIVE",
228
+ "type": "NON_EEF",
229
+ "format": "DEFAULT",
230
+ "state_key": null
231
+ },
232
+ {
233
+ "rep": "ABSOLUTE",
234
+ "type": "NON_EEF",
235
+ "format": "DEFAULT",
236
+ "state_key": null
237
+ }
238
+ ]
239
+ },
240
+ "language": {
241
+ "delta_indices": [
242
+ 0
243
+ ],
244
+ "modality_keys": [
245
+ "task"
246
+ ],
247
+ "sin_cos_embedding_keys": null,
248
+ "mean_std_embedding_keys": null,
249
+ "action_configs": null
250
+ }
251
+ },
252
+ "robocasa_panda_omron": {
253
+ "video": {
254
+ "delta_indices": [
255
+ 0
256
+ ],
257
+ "modality_keys": [
258
+ "res256_image_side_0",
259
+ "res256_image_side_1",
260
+ "res256_image_wrist_0"
261
+ ],
262
+ "sin_cos_embedding_keys": null,
263
+ "mean_std_embedding_keys": null,
264
+ "action_configs": null
265
+ },
266
+ "state": {
267
+ "delta_indices": [
268
+ 0
269
+ ],
270
+ "modality_keys": [
271
+ "end_effector_position_relative",
272
+ "end_effector_rotation_relative",
273
+ "gripper_qpos",
274
+ "base_position",
275
+ "base_rotation"
276
+ ],
277
+ "sin_cos_embedding_keys": null,
278
+ "mean_std_embedding_keys": null,
279
+ "action_configs": null
280
+ },
281
+ "action": {
282
+ "delta_indices": [
283
+ 0,
284
+ 1,
285
+ 2,
286
+ 3,
287
+ 4,
288
+ 5,
289
+ 6,
290
+ 7,
291
+ 8,
292
+ 9,
293
+ 10,
294
+ 11,
295
+ 12,
296
+ 13,
297
+ 14,
298
+ 15
299
+ ],
300
+ "modality_keys": [
301
+ "end_effector_position",
302
+ "end_effector_rotation",
303
+ "gripper_close",
304
+ "base_motion",
305
+ "control_mode"
306
+ ],
307
+ "sin_cos_embedding_keys": null,
308
+ "mean_std_embedding_keys": null,
309
+ "action_configs": [
310
+ {
311
+ "rep": "ABSOLUTE",
312
+ "type": "NON_EEF",
313
+ "format": "DEFAULT",
314
+ "state_key": null
315
+ },
316
+ {
317
+ "rep": "ABSOLUTE",
318
+ "type": "NON_EEF",
319
+ "format": "DEFAULT",
320
+ "state_key": null
321
+ },
322
+ {
323
+ "rep": "ABSOLUTE",
324
+ "type": "NON_EEF",
325
+ "format": "DEFAULT",
326
+ "state_key": null
327
+ },
328
+ {
329
+ "rep": "ABSOLUTE",
330
+ "type": "NON_EEF",
331
+ "format": "DEFAULT",
332
+ "state_key": null
333
+ },
334
+ {
335
+ "rep": "ABSOLUTE",
336
+ "type": "NON_EEF",
337
+ "format": "DEFAULT",
338
+ "state_key": null
339
+ }
340
+ ]
341
+ },
342
+ "language": {
343
+ "delta_indices": [
344
+ 0
345
+ ],
346
+ "modality_keys": [
347
+ "annotation.human.action.task_description"
348
+ ],
349
+ "sin_cos_embedding_keys": null,
350
+ "mean_std_embedding_keys": null,
351
+ "action_configs": null
352
+ }
353
+ },
354
+ "libero_panda": {
355
+ "video": {
356
+ "delta_indices": [
357
+ 0
358
+ ],
359
+ "modality_keys": [
360
+ "image",
361
+ "wrist_image"
362
+ ],
363
+ "sin_cos_embedding_keys": null,
364
+ "mean_std_embedding_keys": null,
365
+ "action_configs": null
366
+ },
367
+ "state": {
368
+ "delta_indices": [
369
+ 0
370
+ ],
371
+ "modality_keys": [
372
+ "x",
373
+ "y",
374
+ "z",
375
+ "roll",
376
+ "pitch",
377
+ "yaw",
378
+ "gripper"
379
+ ],
380
+ "sin_cos_embedding_keys": null,
381
+ "mean_std_embedding_keys": null,
382
+ "action_configs": null
383
+ },
384
+ "action": {
385
+ "delta_indices": [
386
+ 0,
387
+ 1,
388
+ 2,
389
+ 3,
390
+ 4,
391
+ 5,
392
+ 6,
393
+ 7,
394
+ 8,
395
+ 9,
396
+ 10,
397
+ 11,
398
+ 12,
399
+ 13,
400
+ 14,
401
+ 15
402
+ ],
403
+ "modality_keys": [
404
+ "x",
405
+ "y",
406
+ "z",
407
+ "roll",
408
+ "pitch",
409
+ "yaw",
410
+ "gripper"
411
+ ],
412
+ "sin_cos_embedding_keys": null,
413
+ "mean_std_embedding_keys": null,
414
+ "action_configs": [
415
+ {
416
+ "rep": "ABSOLUTE",
417
+ "type": "NON_EEF",
418
+ "format": "DEFAULT",
419
+ "state_key": null
420
+ },
421
+ {
422
+ "rep": "ABSOLUTE",
423
+ "type": "NON_EEF",
424
+ "format": "DEFAULT",
425
+ "state_key": null
426
+ },
427
+ {
428
+ "rep": "ABSOLUTE",
429
+ "type": "NON_EEF",
430
+ "format": "DEFAULT",
431
+ "state_key": null
432
+ },
433
+ {
434
+ "rep": "ABSOLUTE",
435
+ "type": "NON_EEF",
436
+ "format": "DEFAULT",
437
+ "state_key": null
438
+ },
439
+ {
440
+ "rep": "ABSOLUTE",
441
+ "type": "NON_EEF",
442
+ "format": "DEFAULT",
443
+ "state_key": null
444
+ },
445
+ {
446
+ "rep": "ABSOLUTE",
447
+ "type": "NON_EEF",
448
+ "format": "DEFAULT",
449
+ "state_key": null
450
+ },
451
+ {
452
+ "rep": "ABSOLUTE",
453
+ "type": "NON_EEF",
454
+ "format": "DEFAULT",
455
+ "state_key": null
456
+ }
457
+ ]
458
+ },
459
+ "language": {
460
+ "delta_indices": [
461
+ 0
462
+ ],
463
+ "modality_keys": [
464
+ "annotation.human.action.task_description"
465
+ ],
466
+ "sin_cos_embedding_keys": null,
467
+ "mean_std_embedding_keys": null,
468
+ "action_configs": null
469
+ }
470
+ },
471
+ "unitree_g1": {
472
+ "video": {
473
+ "delta_indices": [
474
+ 0
475
+ ],
476
+ "modality_keys": [
477
+ "ego_view"
478
+ ],
479
+ "sin_cos_embedding_keys": null,
480
+ "mean_std_embedding_keys": null,
481
+ "action_configs": null
482
+ },
483
+ "state": {
484
+ "delta_indices": [
485
+ 0
486
+ ],
487
+ "modality_keys": [
488
+ "left_leg",
489
+ "right_leg",
490
+ "waist",
491
+ "left_arm",
492
+ "right_arm",
493
+ "left_hand",
494
+ "right_hand"
495
+ ],
496
+ "sin_cos_embedding_keys": null,
497
+ "mean_std_embedding_keys": null,
498
+ "action_configs": null
499
+ },
500
+ "action": {
501
+ "delta_indices": [
502
+ 0,
503
+ 1,
504
+ 2,
505
+ 3,
506
+ 4,
507
+ 5,
508
+ 6,
509
+ 7,
510
+ 8,
511
+ 9,
512
+ 10,
513
+ 11,
514
+ 12,
515
+ 13,
516
+ 14,
517
+ 15,
518
+ 16,
519
+ 17,
520
+ 18,
521
+ 19,
522
+ 20,
523
+ 21,
524
+ 22,
525
+ 23,
526
+ 24,
527
+ 25,
528
+ 26,
529
+ 27,
530
+ 28,
531
+ 29
532
+ ],
533
+ "modality_keys": [
534
+ "left_arm",
535
+ "right_arm",
536
+ "left_hand",
537
+ "right_hand",
538
+ "waist",
539
+ "base_height_command",
540
+ "navigate_command"
541
+ ],
542
+ "sin_cos_embedding_keys": null,
543
+ "mean_std_embedding_keys": null,
544
+ "action_configs": [
545
+ {
546
+ "rep": "RELATIVE",
547
+ "type": "NON_EEF",
548
+ "format": "DEFAULT",
549
+ "state_key": null
550
+ },
551
+ {
552
+ "rep": "RELATIVE",
553
+ "type": "NON_EEF",
554
+ "format": "DEFAULT",
555
+ "state_key": null
556
+ },
557
+ {
558
+ "rep": "ABSOLUTE",
559
+ "type": "NON_EEF",
560
+ "format": "DEFAULT",
561
+ "state_key": null
562
+ },
563
+ {
564
+ "rep": "ABSOLUTE",
565
+ "type": "NON_EEF",
566
+ "format": "DEFAULT",
567
+ "state_key": null
568
+ },
569
+ {
570
+ "rep": "ABSOLUTE",
571
+ "type": "NON_EEF",
572
+ "format": "DEFAULT",
573
+ "state_key": null
574
+ },
575
+ {
576
+ "rep": "ABSOLUTE",
577
+ "type": "NON_EEF",
578
+ "format": "DEFAULT",
579
+ "state_key": null
580
+ },
581
+ {
582
+ "rep": "ABSOLUTE",
583
+ "type": "NON_EEF",
584
+ "format": "DEFAULT",
585
+ "state_key": null
586
+ }
587
+ ]
588
+ },
589
+ "language": {
590
+ "delta_indices": [
591
+ 0
592
+ ],
593
+ "modality_keys": [
594
+ "annotation.human.task_description"
595
+ ],
596
+ "sin_cos_embedding_keys": null,
597
+ "mean_std_embedding_keys": null,
598
+ "action_configs": null
599
+ }
600
+ },
601
+ "oxe_widowx": {
602
+ "video": {
603
+ "delta_indices": [
604
+ 0
605
+ ],
606
+ "modality_keys": [
607
+ "image_0"
608
+ ],
609
+ "sin_cos_embedding_keys": null,
610
+ "mean_std_embedding_keys": null,
611
+ "action_configs": null
612
+ },
613
+ "state": {
614
+ "delta_indices": [
615
+ 0
616
+ ],
617
+ "modality_keys": [
618
+ "x",
619
+ "y",
620
+ "z",
621
+ "roll",
622
+ "pitch",
623
+ "yaw",
624
+ "pad",
625
+ "gripper"
626
+ ],
627
+ "sin_cos_embedding_keys": null,
628
+ "mean_std_embedding_keys": null,
629
+ "action_configs": null
630
+ },
631
+ "action": {
632
+ "delta_indices": [
633
+ 0,
634
+ 1,
635
+ 2,
636
+ 3,
637
+ 4,
638
+ 5,
639
+ 6,
640
+ 7
641
+ ],
642
+ "modality_keys": [
643
+ "x",
644
+ "y",
645
+ "z",
646
+ "roll",
647
+ "pitch",
648
+ "yaw",
649
+ "gripper"
650
+ ],
651
+ "sin_cos_embedding_keys": null,
652
+ "mean_std_embedding_keys": [
653
+ "x",
654
+ "y",
655
+ "z",
656
+ "roll",
657
+ "pitch",
658
+ "yaw"
659
+ ],
660
+ "action_configs": [
661
+ {
662
+ "rep": "ABSOLUTE",
663
+ "type": "NON_EEF",
664
+ "format": "DEFAULT",
665
+ "state_key": null
666
+ },
667
+ {
668
+ "rep": "ABSOLUTE",
669
+ "type": "NON_EEF",
670
+ "format": "DEFAULT",
671
+ "state_key": null
672
+ },
673
+ {
674
+ "rep": "ABSOLUTE",
675
+ "type": "NON_EEF",
676
+ "format": "DEFAULT",
677
+ "state_key": null
678
+ },
679
+ {
680
+ "rep": "ABSOLUTE",
681
+ "type": "NON_EEF",
682
+ "format": "DEFAULT",
683
+ "state_key": null
684
+ },
685
+ {
686
+ "rep": "ABSOLUTE",
687
+ "type": "NON_EEF",
688
+ "format": "DEFAULT",
689
+ "state_key": null
690
+ },
691
+ {
692
+ "rep": "ABSOLUTE",
693
+ "type": "NON_EEF",
694
+ "format": "DEFAULT",
695
+ "state_key": null
696
+ },
697
+ {
698
+ "rep": "ABSOLUTE",
699
+ "type": "NON_EEF",
700
+ "format": "DEFAULT",
701
+ "state_key": null
702
+ }
703
+ ]
704
+ },
705
+ "language": {
706
+ "delta_indices": [
707
+ 0
708
+ ],
709
+ "modality_keys": [
710
+ "annotation.human.action.task_description"
711
+ ],
712
+ "sin_cos_embedding_keys": null,
713
+ "mean_std_embedding_keys": null,
714
+ "action_configs": null
715
+ }
716
+ },
717
+ "oxe_google": {
718
+ "video": {
719
+ "delta_indices": [
720
+ 0
721
+ ],
722
+ "modality_keys": [
723
+ "image"
724
+ ],
725
+ "sin_cos_embedding_keys": null,
726
+ "mean_std_embedding_keys": null,
727
+ "action_configs": null
728
+ },
729
+ "state": {
730
+ "delta_indices": [
731
+ 0
732
+ ],
733
+ "modality_keys": [
734
+ "x",
735
+ "y",
736
+ "z",
737
+ "rx",
738
+ "ry",
739
+ "rz",
740
+ "rw",
741
+ "gripper"
742
+ ],
743
+ "sin_cos_embedding_keys": null,
744
+ "mean_std_embedding_keys": null,
745
+ "action_configs": null
746
+ },
747
+ "action": {
748
+ "delta_indices": [
749
+ 0,
750
+ 1,
751
+ 2,
752
+ 3,
753
+ 4,
754
+ 5,
755
+ 6,
756
+ 7
757
+ ],
758
+ "modality_keys": [
759
+ "x",
760
+ "y",
761
+ "z",
762
+ "roll",
763
+ "pitch",
764
+ "yaw",
765
+ "gripper"
766
+ ],
767
+ "sin_cos_embedding_keys": null,
768
+ "mean_std_embedding_keys": [
769
+ "x",
770
+ "y",
771
+ "z",
772
+ "roll",
773
+ "pitch",
774
+ "yaw"
775
+ ],
776
+ "action_configs": [
777
+ {
778
+ "rep": "ABSOLUTE",
779
+ "type": "NON_EEF",
780
+ "format": "DEFAULT",
781
+ "state_key": null
782
+ },
783
+ {
784
+ "rep": "ABSOLUTE",
785
+ "type": "NON_EEF",
786
+ "format": "DEFAULT",
787
+ "state_key": null
788
+ },
789
+ {
790
+ "rep": "ABSOLUTE",
791
+ "type": "NON_EEF",
792
+ "format": "DEFAULT",
793
+ "state_key": null
794
+ },
795
+ {
796
+ "rep": "ABSOLUTE",
797
+ "type": "NON_EEF",
798
+ "format": "DEFAULT",
799
+ "state_key": null
800
+ },
801
+ {
802
+ "rep": "ABSOLUTE",
803
+ "type": "NON_EEF",
804
+ "format": "DEFAULT",
805
+ "state_key": null
806
+ },
807
+ {
808
+ "rep": "ABSOLUTE",
809
+ "type": "NON_EEF",
810
+ "format": "DEFAULT",
811
+ "state_key": null
812
+ },
813
+ {
814
+ "rep": "ABSOLUTE",
815
+ "type": "NON_EEF",
816
+ "format": "DEFAULT",
817
+ "state_key": null
818
+ }
819
+ ]
820
+ },
821
+ "language": {
822
+ "delta_indices": [
823
+ 0
824
+ ],
825
+ "modality_keys": [
826
+ "annotation.human.action.task_description"
827
+ ],
828
+ "sin_cos_embedding_keys": null,
829
+ "mean_std_embedding_keys": null,
830
+ "action_configs": null
831
+ }
832
+ }
833
+ },
834
+ "image_crop_size": null,
835
+ "image_target_size": null,
836
+ "use_albumentations": true,
837
+ "random_rotation_angle": null,
838
+ "color_jitter_params": {
839
+ "brightness": 0.3,
840
+ "contrast": 0.4,
841
+ "saturation": 0.5,
842
+ "hue": 0.08
843
+ },
844
+ "shortest_image_edge": 256,
845
+ "crop_fraction": 0.95,
846
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
847
+ "model_type": "eagle",
848
+ "formalize_language": true,
849
+ "max_state_dim": 128,
850
+ "max_action_dim": 128,
851
+ "max_action_horizon": 50,
852
+ "use_percentiles": false,
853
+ "clip_outliers": true,
854
+ "apply_sincos_state_encoding": true,
855
+ "use_relative_action": true
856
+ }
857
+ }
checkpoint-20000/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0558e57789e68480e96dcf4075c3e42552fe8abb6b893a56d641c7984c38455
3
+ size 15173
checkpoint-20000/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14df497b7b7463d05e405afbc22db528e4524dba3422577d1f8094b1f3780d14
3
+ size 15173
checkpoint-20000/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5500565343718ea964549974c4f17305680a38e9ec97ed6fc58fc814fd66de54
3
+ size 15173
checkpoint-20000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fa802a80def971b73ec74284a6aa44d0b2ea101bd38ed41a3b1c1a0b4001f00
3
+ size 1465
checkpoint-20000/statistics.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-20000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-20000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b79c4dd31bacb974d13835e6daead4b8fed3e76becb4c5e306d20af9f1f9ae78
3
+ size 5777
checkpoint-20000/wandb_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"project": "finetune-gr00t-n1d6", "run_id": "stage_1_2"}
config.json ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_horizon": 50,
3
+ "add_bos_embedding": true,
4
+ "add_pos_embed": true,
5
+ "apply_sincos_state_encoding": true,
6
+ "architectures": [
7
+ "Gr00tN1d6"
8
+ ],
9
+ "attn_dropout": 0.2,
10
+ "attn_implementation": null,
11
+ "backbone_embedding_dim": 2048,
12
+ "backbone_model_type": "eagle",
13
+ "backbone_trainable_params_fp32": true,
14
+ "block_size": 256,
15
+ "collator_overwrite_image_inputs": false,
16
+ "color_jitter_params": {
17
+ "brightness": 0.1,
18
+ "contrast": 0.1,
19
+ "hue": 0.1,
20
+ "saturation": 0.1
21
+ },
22
+ "crop_fraction": 0.95,
23
+ "decay_frequency": 1,
24
+ "diffusion_model_cfg": {
25
+ "attention_head_dim": 48,
26
+ "dropout": 0.2,
27
+ "final_dropout": true,
28
+ "interleave_self_attention": true,
29
+ "norm_type": "ada_norm",
30
+ "num_attention_heads": 32,
31
+ "num_layers": 32,
32
+ "output_dim": 1024,
33
+ "positional_embeddings": null
34
+ },
35
+ "drop_ltm_memory": true,
36
+ "eagle_collator": true,
37
+ "formalize_language": true,
38
+ "gemma_collator": false,
39
+ "hidden_size": 1024,
40
+ "image_crop_size": null,
41
+ "image_target_size": null,
42
+ "initial_recall_frequency": 10,
43
+ "input_embedding_dim": 1536,
44
+ "load_bf16": true,
45
+ "ltm_retrieve_blocks": 10,
46
+ "max_action_dim": 128,
47
+ "max_num_embodiments": 32,
48
+ "max_seq_len": 1024,
49
+ "max_state_dim": 128,
50
+ "model_dtype": "bfloat16",
51
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
52
+ "model_type": "Gr00tN1d6",
53
+ "noise_beta_alpha": 1.5,
54
+ "noise_beta_beta": 1.0,
55
+ "noise_s": 0.999,
56
+ "num_inference_timesteps": 4,
57
+ "num_ltm_blocks": 10,
58
+ "num_selector_layers": 2,
59
+ "num_timestep_buckets": 1000,
60
+ "num_total_blocks": 50,
61
+ "random_retriever_length": false,
62
+ "random_rotation_angle": null,
63
+ "reproject_vision": false,
64
+ "select_layer": 16,
65
+ "selector_hidden_dim": 256,
66
+ "shortest_image_edge": 256,
67
+ "state_dropout_prob": 0.0,
68
+ "torch_dtype": "bfloat16",
69
+ "transformers_version": "4.53.2",
70
+ "tune_diffusion_model": false,
71
+ "tune_enc_query_proj": false,
72
+ "tune_key_proj": false,
73
+ "tune_llm": false,
74
+ "tune_memory_bos_embedding": false,
75
+ "tune_memory_loras": false,
76
+ "tune_memory_pos_embeddings": false,
77
+ "tune_projector": true,
78
+ "tune_query_proj": false,
79
+ "tune_top_llm_layers": 4,
80
+ "tune_visual": false,
81
+ "tune_vlln": false,
82
+ "use_albumentations_transforms": true,
83
+ "use_alternate_vl_dit": true,
84
+ "use_flash_attention": true,
85
+ "use_relative_action": true,
86
+ "use_vlln": true
87
+ }
experiment_cfg/conf.yaml ADDED
@@ -0,0 +1,556 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ load_config_path: null
2
+ model:
3
+ model_type: Gr00tN1d6
4
+ model_dtype: bfloat16
5
+ model_name: nvidia/Eagle-Block2A-2B-v2
6
+ backbone_model_type: eagle
7
+ model_revision: null
8
+ tune_top_llm_layers: 0
9
+ backbone_embedding_dim: 2048
10
+ tune_llm: false
11
+ tune_visual: false
12
+ select_layer: 16
13
+ reproject_vision: false
14
+ use_flash_attention: true
15
+ load_bf16: false
16
+ collator_overwrite_image_inputs: false
17
+ eagle_collator: true
18
+ backbone_trainable_params_fp32: true
19
+ image_crop_size: null
20
+ image_target_size: null
21
+ shortest_image_edge: 256
22
+ crop_fraction: 0.95
23
+ random_rotation_angle: null
24
+ color_jitter_params: null
25
+ use_albumentations_transforms: true
26
+ formalize_language: true
27
+ apply_sincos_state_encoding: false
28
+ use_relative_action: true
29
+ max_state_dim: 29
30
+ max_action_dim: 29
31
+ action_horizon: 16
32
+ hidden_size: 1024
33
+ input_embedding_dim: 1536
34
+ add_pos_embed: true
35
+ attn_dropout: 0.2
36
+ use_vlln: true
37
+ max_seq_len: 1024
38
+ use_alternate_vl_dit: true
39
+ attend_text_every_n_blocks: 2
40
+ diffusion_model_cfg:
41
+ positional_embeddings: null
42
+ num_layers: 32
43
+ num_attention_heads: 32
44
+ attention_head_dim: 48
45
+ norm_type: ada_norm
46
+ dropout: 0.2
47
+ final_dropout: true
48
+ output_dim: 1024
49
+ interleave_self_attention: true
50
+ num_inference_timesteps: 4
51
+ noise_beta_alpha: 1.5
52
+ noise_beta_beta: 1.0
53
+ noise_s: 0.999
54
+ num_timestep_buckets: 1000
55
+ tune_projector: true
56
+ tune_diffusion_model: false
57
+ tune_vlln: false
58
+ state_dropout_prob: 0.0
59
+ state_additive_noise_scale: 0.0
60
+ max_num_embodiments: 32
61
+ data:
62
+ datasets:
63
+ - dataset_paths:
64
+ - /workspace/gr00t/examples/LIBERO/libero_goal_no_noops_1.0.0_lerobot
65
+ embodiment_tag: libero_panda
66
+ mix_ratio: 1.0
67
+ dataset_type: physical_embodiment
68
+ val_dataset_path: null
69
+ video_backend: torchcodec
70
+ - dataset_paths:
71
+ - /workspace/gr00t/examples/LIBERO/libero_10_no_noops_1.0.0_lerobot
72
+ embodiment_tag: libero_panda
73
+ mix_ratio: 1.0
74
+ dataset_type: physical_embodiment
75
+ val_dataset_path: null
76
+ video_backend: torchcodec
77
+ - dataset_paths:
78
+ - /workspace/gr00t/examples/LIBERO/libero_object_no_noops_1.0.0_lerobot
79
+ embodiment_tag: libero_panda
80
+ mix_ratio: 1.0
81
+ dataset_type: physical_embodiment
82
+ val_dataset_path: null
83
+ video_backend: torchcodec
84
+ - dataset_paths:
85
+ - /workspace/gr00t/examples/LIBERO/libero_spatial_no_noops_1.0.0_lerobot
86
+ embodiment_tag: libero_panda
87
+ mix_ratio: 1.0
88
+ dataset_type: physical_embodiment
89
+ val_dataset_path: null
90
+ video_backend: torchcodec
91
+ - dataset_paths:
92
+ - /workspace/gr00t/examples/SimplerEnv/bridge_orig_lerobot/
93
+ embodiment_tag: oxe_widowx
94
+ mix_ratio: 1.0
95
+ dataset_type: physical_embodiment
96
+ val_dataset_path: null
97
+ video_backend: torchcodec
98
+ - dataset_paths:
99
+ - /workspace/gr00t/examples/SimplerEnv/fractal20220817_data_lerobot/
100
+ embodiment_tag: oxe_google
101
+ mix_ratio: 1.0
102
+ dataset_type: physical_embodiment
103
+ val_dataset_path: null
104
+ video_backend: torchcodec
105
+ - dataset_paths:
106
+ - /workspace/gr00t/examples/GR00T-WholeBodyControl/PhysicalAI-Robotics-GR00T-X-Embodiment-Sim/unitree_g1.LMPnPAppleToPlateDC/
107
+ embodiment_tag: unitree_g1
108
+ mix_ratio: 1.0
109
+ dataset_type: physical_embodiment
110
+ val_dataset_path: null
111
+ video_backend: torchcodec
112
+ modality_configs:
113
+ oxe_google:
114
+ video:
115
+ delta_indices:
116
+ - 0
117
+ modality_keys:
118
+ - image
119
+ sin_cos_embedding_keys: null
120
+ mean_std_embedding_keys: null
121
+ action_configs: null
122
+ state:
123
+ delta_indices:
124
+ - 0
125
+ modality_keys:
126
+ - x
127
+ - 'y'
128
+ - z
129
+ - rx
130
+ - ry
131
+ - rz
132
+ - rw
133
+ - gripper
134
+ sin_cos_embedding_keys: null
135
+ mean_std_embedding_keys: null
136
+ action_configs: null
137
+ action:
138
+ delta_indices:
139
+ - 0
140
+ - 1
141
+ - 2
142
+ - 3
143
+ - 4
144
+ - 5
145
+ - 6
146
+ - 7
147
+ modality_keys:
148
+ - x
149
+ - 'y'
150
+ - z
151
+ - roll
152
+ - pitch
153
+ - yaw
154
+ - gripper
155
+ sin_cos_embedding_keys: null
156
+ mean_std_embedding_keys:
157
+ - x
158
+ - 'y'
159
+ - z
160
+ - roll
161
+ - pitch
162
+ - yaw
163
+ action_configs:
164
+ - rep: ABSOLUTE
165
+ type: NON_EEF
166
+ format: DEFAULT
167
+ state_key: null
168
+ - rep: ABSOLUTE
169
+ type: NON_EEF
170
+ format: DEFAULT
171
+ state_key: null
172
+ - rep: ABSOLUTE
173
+ type: NON_EEF
174
+ format: DEFAULT
175
+ state_key: null
176
+ - rep: ABSOLUTE
177
+ type: NON_EEF
178
+ format: DEFAULT
179
+ state_key: null
180
+ - rep: ABSOLUTE
181
+ type: NON_EEF
182
+ format: DEFAULT
183
+ state_key: null
184
+ - rep: ABSOLUTE
185
+ type: NON_EEF
186
+ format: DEFAULT
187
+ state_key: null
188
+ - rep: ABSOLUTE
189
+ type: NON_EEF
190
+ format: DEFAULT
191
+ state_key: null
192
+ language:
193
+ delta_indices:
194
+ - 0
195
+ modality_keys:
196
+ - annotation.human.action.task_description
197
+ sin_cos_embedding_keys: null
198
+ mean_std_embedding_keys: null
199
+ action_configs: null
200
+ unitree_g1:
201
+ video:
202
+ delta_indices:
203
+ - 0
204
+ modality_keys:
205
+ - ego_view
206
+ sin_cos_embedding_keys: null
207
+ mean_std_embedding_keys: null
208
+ action_configs: null
209
+ state:
210
+ delta_indices:
211
+ - 0
212
+ modality_keys:
213
+ - left_leg
214
+ - right_leg
215
+ - waist
216
+ - left_arm
217
+ - right_arm
218
+ - left_hand
219
+ - right_hand
220
+ sin_cos_embedding_keys: null
221
+ mean_std_embedding_keys: null
222
+ action_configs: null
223
+ action:
224
+ delta_indices:
225
+ - 0
226
+ - 1
227
+ - 2
228
+ - 3
229
+ - 4
230
+ - 5
231
+ - 6
232
+ - 7
233
+ - 8
234
+ - 9
235
+ - 10
236
+ - 11
237
+ - 12
238
+ - 13
239
+ - 14
240
+ - 15
241
+ - 16
242
+ - 17
243
+ - 18
244
+ - 19
245
+ - 20
246
+ - 21
247
+ - 22
248
+ - 23
249
+ - 24
250
+ - 25
251
+ - 26
252
+ - 27
253
+ - 28
254
+ - 29
255
+ modality_keys:
256
+ - left_arm
257
+ - right_arm
258
+ - left_hand
259
+ - right_hand
260
+ - waist
261
+ - base_height_command
262
+ - navigate_command
263
+ sin_cos_embedding_keys: null
264
+ mean_std_embedding_keys: null
265
+ action_configs:
266
+ - rep: RELATIVE
267
+ type: NON_EEF
268
+ format: DEFAULT
269
+ state_key: null
270
+ - rep: RELATIVE
271
+ type: NON_EEF
272
+ format: DEFAULT
273
+ state_key: null
274
+ - rep: ABSOLUTE
275
+ type: NON_EEF
276
+ format: DEFAULT
277
+ state_key: null
278
+ - rep: ABSOLUTE
279
+ type: NON_EEF
280
+ format: DEFAULT
281
+ state_key: null
282
+ - rep: ABSOLUTE
283
+ type: NON_EEF
284
+ format: DEFAULT
285
+ state_key: null
286
+ - rep: ABSOLUTE
287
+ type: NON_EEF
288
+ format: DEFAULT
289
+ state_key: null
290
+ - rep: ABSOLUTE
291
+ type: NON_EEF
292
+ format: DEFAULT
293
+ state_key: null
294
+ language:
295
+ delta_indices:
296
+ - 0
297
+ modality_keys:
298
+ - annotation.human.task_description
299
+ sin_cos_embedding_keys: null
300
+ mean_std_embedding_keys: null
301
+ action_configs: null
302
+ oxe_widowx:
303
+ video:
304
+ delta_indices:
305
+ - 0
306
+ modality_keys:
307
+ - image_0
308
+ sin_cos_embedding_keys: null
309
+ mean_std_embedding_keys: null
310
+ action_configs: null
311
+ state:
312
+ delta_indices:
313
+ - 0
314
+ modality_keys:
315
+ - x
316
+ - 'y'
317
+ - z
318
+ - roll
319
+ - pitch
320
+ - yaw
321
+ - pad
322
+ - gripper
323
+ sin_cos_embedding_keys: null
324
+ mean_std_embedding_keys: null
325
+ action_configs: null
326
+ action:
327
+ delta_indices:
328
+ - 0
329
+ - 1
330
+ - 2
331
+ - 3
332
+ - 4
333
+ - 5
334
+ - 6
335
+ - 7
336
+ modality_keys:
337
+ - x
338
+ - 'y'
339
+ - z
340
+ - roll
341
+ - pitch
342
+ - yaw
343
+ - gripper
344
+ sin_cos_embedding_keys: null
345
+ mean_std_embedding_keys:
346
+ - x
347
+ - 'y'
348
+ - z
349
+ - roll
350
+ - pitch
351
+ - yaw
352
+ action_configs:
353
+ - rep: ABSOLUTE
354
+ type: NON_EEF
355
+ format: DEFAULT
356
+ state_key: null
357
+ - rep: ABSOLUTE
358
+ type: NON_EEF
359
+ format: DEFAULT
360
+ state_key: null
361
+ - rep: ABSOLUTE
362
+ type: NON_EEF
363
+ format: DEFAULT
364
+ state_key: null
365
+ - rep: ABSOLUTE
366
+ type: NON_EEF
367
+ format: DEFAULT
368
+ state_key: null
369
+ - rep: ABSOLUTE
370
+ type: NON_EEF
371
+ format: DEFAULT
372
+ state_key: null
373
+ - rep: ABSOLUTE
374
+ type: NON_EEF
375
+ format: DEFAULT
376
+ state_key: null
377
+ - rep: ABSOLUTE
378
+ type: NON_EEF
379
+ format: DEFAULT
380
+ state_key: null
381
+ language:
382
+ delta_indices:
383
+ - 0
384
+ modality_keys:
385
+ - annotation.human.action.task_description
386
+ sin_cos_embedding_keys: null
387
+ mean_std_embedding_keys: null
388
+ action_configs: null
389
+ libero_panda:
390
+ video:
391
+ delta_indices:
392
+ - 0
393
+ modality_keys:
394
+ - image
395
+ - wrist_image
396
+ sin_cos_embedding_keys: null
397
+ mean_std_embedding_keys: null
398
+ action_configs: null
399
+ state:
400
+ delta_indices:
401
+ - 0
402
+ modality_keys:
403
+ - x
404
+ - 'y'
405
+ - z
406
+ - roll
407
+ - pitch
408
+ - yaw
409
+ - gripper
410
+ sin_cos_embedding_keys: null
411
+ mean_std_embedding_keys: null
412
+ action_configs: null
413
+ action:
414
+ delta_indices:
415
+ - 0
416
+ - 1
417
+ - 2
418
+ - 3
419
+ - 4
420
+ - 5
421
+ - 6
422
+ - 7
423
+ - 8
424
+ - 9
425
+ - 10
426
+ - 11
427
+ - 12
428
+ - 13
429
+ - 14
430
+ - 15
431
+ modality_keys:
432
+ - x
433
+ - 'y'
434
+ - z
435
+ - roll
436
+ - pitch
437
+ - yaw
438
+ - gripper
439
+ sin_cos_embedding_keys: null
440
+ mean_std_embedding_keys: null
441
+ action_configs:
442
+ - rep: ABSOLUTE
443
+ type: NON_EEF
444
+ format: DEFAULT
445
+ state_key: null
446
+ - rep: ABSOLUTE
447
+ type: NON_EEF
448
+ format: DEFAULT
449
+ state_key: null
450
+ - rep: ABSOLUTE
451
+ type: NON_EEF
452
+ format: DEFAULT
453
+ state_key: null
454
+ - rep: ABSOLUTE
455
+ type: NON_EEF
456
+ format: DEFAULT
457
+ state_key: null
458
+ - rep: ABSOLUTE
459
+ type: NON_EEF
460
+ format: DEFAULT
461
+ state_key: null
462
+ - rep: ABSOLUTE
463
+ type: NON_EEF
464
+ format: DEFAULT
465
+ state_key: null
466
+ - rep: ABSOLUTE
467
+ type: NON_EEF
468
+ format: DEFAULT
469
+ state_key: null
470
+ language:
471
+ delta_indices:
472
+ - 0
473
+ modality_keys:
474
+ - annotation.human.action.task_description
475
+ sin_cos_embedding_keys: null
476
+ mean_std_embedding_keys: null
477
+ action_configs: null
478
+ download_cache: false
479
+ shard_size: 1024
480
+ episode_sampling_rate: 0.1
481
+ num_shards_per_epoch: 100000
482
+ override_pretraining_statistics: false
483
+ mode: scenario
484
+ random_chop: 0.0
485
+ mock_dataset_mode: false
486
+ shuffle: true
487
+ seed: 42
488
+ multiprocessing_context: fork
489
+ allow_padding: false
490
+ subsample_ratio: 1.0
491
+ image_crop_size:
492
+ - 244
493
+ - 244
494
+ image_target_size:
495
+ - 224
496
+ - 224
497
+ video_backend: torchcodec
498
+ training:
499
+ output_dir: models/attempt_2/stage_1_2/
500
+ experiment_name: null
501
+ max_steps: 20000
502
+ global_batch_size: 12
503
+ batch_size: null
504
+ gradient_accumulation_steps: 1
505
+ learning_rate: 0.0001
506
+ lr_scheduler_type: cosine
507
+ weight_decay: 1.0e-05
508
+ warmup_ratio: 0.05
509
+ warmup_steps: 0
510
+ max_grad_norm: 1.0
511
+ optim: adamw_torch_fused
512
+ start_from_checkpoint: models/attempt_2/stage_1/checkpoint-12000/
513
+ tf32: true
514
+ fp16: false
515
+ bf16: true
516
+ eval_bf16: true
517
+ logging_steps: 10
518
+ save_steps: 2000
519
+ save_total_limit: 5
520
+ save_vl_model: false
521
+ upload_checkpoints: false
522
+ upload_every: 1000
523
+ upload_last_n_checkpoints: 5
524
+ max_concurrent_uploads: 2
525
+ eval_strategy: 'no'
526
+ eval_steps: 500
527
+ eval_set_split_ratio: 0.1
528
+ eval_batch_size: 2
529
+ save_best_eval_metric_name: ''
530
+ save_best_eval_metric_greater_is_better: true
531
+ deepspeed_stage: 2
532
+ gradient_checkpointing: false
533
+ transformers_trust_remote_code: true
534
+ transformers_local_files_only: false
535
+ transformers_cache_dir: null
536
+ transformers_access_token: null
537
+ use_ddp: false
538
+ ddp_bucket_cap_mb: 100
539
+ num_gpus: 1
540
+ dataloader_num_workers: 9
541
+ remove_unused_columns: false
542
+ use_wandb: true
543
+ wandb_project: finetune-gr00t-n1d6
544
+ enable_profiling: false
545
+ max_retries: 3
546
+ assert_loss_less_than: null
547
+ add_rl_callback: false
548
+ enable_open_loop_eval: false
549
+ open_loop_eval_traj_ids:
550
+ - 0
551
+ open_loop_eval_steps_per_traj: 100
552
+ open_loop_eval_plot_indices: null
553
+ scenario: gr00t.experiment_configs.stage_1.Stage1Dataset
554
+ experiment: gr00t.experiment_configs.stage_1.Stage1Experiment
555
+ max_steps: 20000
556
+ save_steps: 2000
experiment_cfg/config.yaml ADDED
@@ -0,0 +1,569 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !!python/object:gr00t.configs.base_config.MemoryExperimentConfig
2
+ data: !!python/object:gr00t.configs.data.data_config.DataConfig
3
+ allow_padding: false
4
+ datasets:
5
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
6
+ dataset_paths:
7
+ - /workspace/gr00t/examples/LIBERO/libero_goal_no_noops_1.0.0_lerobot
8
+ dataset_type: physical_embodiment
9
+ embodiment_tag: libero_panda
10
+ mix_ratio: 1.0
11
+ val_dataset_path: null
12
+ video_backend: torchcodec
13
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
14
+ dataset_paths:
15
+ - /workspace/gr00t/examples/LIBERO/libero_10_no_noops_1.0.0_lerobot
16
+ dataset_type: physical_embodiment
17
+ embodiment_tag: libero_panda
18
+ mix_ratio: 1.0
19
+ val_dataset_path: null
20
+ video_backend: torchcodec
21
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
22
+ dataset_paths:
23
+ - /workspace/gr00t/examples/LIBERO/libero_object_no_noops_1.0.0_lerobot
24
+ dataset_type: physical_embodiment
25
+ embodiment_tag: libero_panda
26
+ mix_ratio: 1.0
27
+ val_dataset_path: null
28
+ video_backend: torchcodec
29
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
30
+ dataset_paths:
31
+ - /workspace/gr00t/examples/LIBERO/libero_spatial_no_noops_1.0.0_lerobot
32
+ dataset_type: physical_embodiment
33
+ embodiment_tag: libero_panda
34
+ mix_ratio: 1.0
35
+ val_dataset_path: null
36
+ video_backend: torchcodec
37
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
38
+ dataset_paths:
39
+ - /workspace/gr00t/examples/SimplerEnv/bridge_orig_lerobot/
40
+ dataset_type: physical_embodiment
41
+ embodiment_tag: oxe_widowx
42
+ mix_ratio: 1.0
43
+ val_dataset_path: null
44
+ video_backend: torchcodec
45
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
46
+ dataset_paths:
47
+ - /workspace/gr00t/examples/SimplerEnv/fractal20220817_data_lerobot/
48
+ dataset_type: physical_embodiment
49
+ embodiment_tag: oxe_google
50
+ mix_ratio: 1.0
51
+ val_dataset_path: null
52
+ video_backend: torchcodec
53
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
54
+ dataset_paths:
55
+ - /workspace/gr00t/examples/GR00T-WholeBodyControl/PhysicalAI-Robotics-GR00T-X-Embodiment-Sim/unitree_g1.LMPnPAppleToPlateDC/
56
+ dataset_type: physical_embodiment
57
+ embodiment_tag: unitree_g1
58
+ mix_ratio: 1.0
59
+ val_dataset_path: null
60
+ video_backend: torchcodec
61
+ download_cache: false
62
+ episode_sampling_rate: 0.1
63
+ image_crop_size:
64
+ - 244
65
+ - 244
66
+ image_target_size:
67
+ - 224
68
+ - 224
69
+ mock_dataset_mode: false
70
+ modality_configs:
71
+ libero_panda:
72
+ action: !!python/object:gr00t.data.types.ModalityConfig
73
+ action_configs:
74
+ - &id001 !!python/object:gr00t.data.types.ActionConfig
75
+ format: &id002 !!python/object/apply:gr00t.data.types.ActionFormat
76
+ - default
77
+ rep: &id003 !!python/object/apply:gr00t.data.types.ActionRepresentation
78
+ - absolute
79
+ state_key: null
80
+ type: &id004 !!python/object/apply:gr00t.data.types.ActionType
81
+ - non_eef
82
+ - *id001
83
+ - *id001
84
+ - *id001
85
+ - *id001
86
+ - *id001
87
+ - *id001
88
+ delta_indices:
89
+ - 0
90
+ - 1
91
+ - 2
92
+ - 3
93
+ - 4
94
+ - 5
95
+ - 6
96
+ - 7
97
+ - 8
98
+ - 9
99
+ - 10
100
+ - 11
101
+ - 12
102
+ - 13
103
+ - 14
104
+ - 15
105
+ mean_std_embedding_keys: null
106
+ modality_keys:
107
+ - x
108
+ - y
109
+ - z
110
+ - roll
111
+ - pitch
112
+ - yaw
113
+ - gripper
114
+ sin_cos_embedding_keys: null
115
+ language: !!python/object:gr00t.data.types.ModalityConfig
116
+ action_configs: null
117
+ delta_indices:
118
+ - 0
119
+ mean_std_embedding_keys: null
120
+ modality_keys:
121
+ - annotation.human.action.task_description
122
+ sin_cos_embedding_keys: null
123
+ state: !!python/object:gr00t.data.types.ModalityConfig
124
+ action_configs: null
125
+ delta_indices:
126
+ - 0
127
+ mean_std_embedding_keys: null
128
+ modality_keys:
129
+ - x
130
+ - y
131
+ - z
132
+ - roll
133
+ - pitch
134
+ - yaw
135
+ - gripper
136
+ sin_cos_embedding_keys: null
137
+ video: !!python/object:gr00t.data.types.ModalityConfig
138
+ action_configs: null
139
+ delta_indices:
140
+ - 0
141
+ mean_std_embedding_keys: null
142
+ modality_keys:
143
+ - image
144
+ - wrist_image
145
+ sin_cos_embedding_keys: null
146
+ oxe_google:
147
+ action: !!python/object:gr00t.data.types.ModalityConfig
148
+ action_configs:
149
+ - &id005 !!python/object:gr00t.data.types.ActionConfig
150
+ format: *id002
151
+ rep: *id003
152
+ state_key: null
153
+ type: *id004
154
+ - *id005
155
+ - *id005
156
+ - *id005
157
+ - *id005
158
+ - *id005
159
+ - *id005
160
+ delta_indices:
161
+ - 0
162
+ - 1
163
+ - 2
164
+ - 3
165
+ - 4
166
+ - 5
167
+ - 6
168
+ - 7
169
+ mean_std_embedding_keys:
170
+ - x
171
+ - y
172
+ - z
173
+ - roll
174
+ - pitch
175
+ - yaw
176
+ modality_keys:
177
+ - x
178
+ - y
179
+ - z
180
+ - roll
181
+ - pitch
182
+ - yaw
183
+ - gripper
184
+ sin_cos_embedding_keys: null
185
+ language: !!python/object:gr00t.data.types.ModalityConfig
186
+ action_configs: null
187
+ delta_indices:
188
+ - 0
189
+ mean_std_embedding_keys: null
190
+ modality_keys:
191
+ - annotation.human.action.task_description
192
+ sin_cos_embedding_keys: null
193
+ state: !!python/object:gr00t.data.types.ModalityConfig
194
+ action_configs: null
195
+ delta_indices:
196
+ - 0
197
+ mean_std_embedding_keys: null
198
+ modality_keys:
199
+ - x
200
+ - y
201
+ - z
202
+ - rx
203
+ - ry
204
+ - rz
205
+ - rw
206
+ - gripper
207
+ sin_cos_embedding_keys: null
208
+ video: !!python/object:gr00t.data.types.ModalityConfig
209
+ action_configs: null
210
+ delta_indices:
211
+ - 0
212
+ mean_std_embedding_keys: null
213
+ modality_keys:
214
+ - image
215
+ sin_cos_embedding_keys: null
216
+ oxe_widowx:
217
+ action: !!python/object:gr00t.data.types.ModalityConfig
218
+ action_configs:
219
+ - &id006 !!python/object:gr00t.data.types.ActionConfig
220
+ format: *id002
221
+ rep: *id003
222
+ state_key: null
223
+ type: *id004
224
+ - *id006
225
+ - *id006
226
+ - *id006
227
+ - *id006
228
+ - *id006
229
+ - *id006
230
+ delta_indices:
231
+ - 0
232
+ - 1
233
+ - 2
234
+ - 3
235
+ - 4
236
+ - 5
237
+ - 6
238
+ - 7
239
+ mean_std_embedding_keys:
240
+ - x
241
+ - y
242
+ - z
243
+ - roll
244
+ - pitch
245
+ - yaw
246
+ modality_keys:
247
+ - x
248
+ - y
249
+ - z
250
+ - roll
251
+ - pitch
252
+ - yaw
253
+ - gripper
254
+ sin_cos_embedding_keys: null
255
+ language: !!python/object:gr00t.data.types.ModalityConfig
256
+ action_configs: null
257
+ delta_indices:
258
+ - 0
259
+ mean_std_embedding_keys: null
260
+ modality_keys:
261
+ - annotation.human.action.task_description
262
+ sin_cos_embedding_keys: null
263
+ state: !!python/object:gr00t.data.types.ModalityConfig
264
+ action_configs: null
265
+ delta_indices:
266
+ - 0
267
+ mean_std_embedding_keys: null
268
+ modality_keys:
269
+ - x
270
+ - y
271
+ - z
272
+ - roll
273
+ - pitch
274
+ - yaw
275
+ - pad
276
+ - gripper
277
+ sin_cos_embedding_keys: null
278
+ video: !!python/object:gr00t.data.types.ModalityConfig
279
+ action_configs: null
280
+ delta_indices:
281
+ - 0
282
+ mean_std_embedding_keys: null
283
+ modality_keys:
284
+ - image_0
285
+ sin_cos_embedding_keys: null
286
+ unitree_g1:
287
+ action: !!python/object:gr00t.data.types.ModalityConfig
288
+ action_configs:
289
+ - !!python/object:gr00t.data.types.ActionConfig
290
+ format: *id002
291
+ rep: &id007 !!python/object/apply:gr00t.data.types.ActionRepresentation
292
+ - relative
293
+ state_key: null
294
+ type: *id004
295
+ - !!python/object:gr00t.data.types.ActionConfig
296
+ format: *id002
297
+ rep: *id007
298
+ state_key: null
299
+ type: *id004
300
+ - !!python/object:gr00t.data.types.ActionConfig
301
+ format: *id002
302
+ rep: *id003
303
+ state_key: null
304
+ type: *id004
305
+ - !!python/object:gr00t.data.types.ActionConfig
306
+ format: *id002
307
+ rep: *id003
308
+ state_key: null
309
+ type: *id004
310
+ - !!python/object:gr00t.data.types.ActionConfig
311
+ format: *id002
312
+ rep: *id003
313
+ state_key: null
314
+ type: *id004
315
+ - !!python/object:gr00t.data.types.ActionConfig
316
+ format: *id002
317
+ rep: *id003
318
+ state_key: null
319
+ type: *id004
320
+ - !!python/object:gr00t.data.types.ActionConfig
321
+ format: *id002
322
+ rep: *id003
323
+ state_key: null
324
+ type: *id004
325
+ delta_indices:
326
+ - 0
327
+ - 1
328
+ - 2
329
+ - 3
330
+ - 4
331
+ - 5
332
+ - 6
333
+ - 7
334
+ - 8
335
+ - 9
336
+ - 10
337
+ - 11
338
+ - 12
339
+ - 13
340
+ - 14
341
+ - 15
342
+ - 16
343
+ - 17
344
+ - 18
345
+ - 19
346
+ - 20
347
+ - 21
348
+ - 22
349
+ - 23
350
+ - 24
351
+ - 25
352
+ - 26
353
+ - 27
354
+ - 28
355
+ - 29
356
+ mean_std_embedding_keys: null
357
+ modality_keys:
358
+ - left_arm
359
+ - right_arm
360
+ - left_hand
361
+ - right_hand
362
+ - waist
363
+ - base_height_command
364
+ - navigate_command
365
+ sin_cos_embedding_keys: null
366
+ language: !!python/object:gr00t.data.types.ModalityConfig
367
+ action_configs: null
368
+ delta_indices:
369
+ - 0
370
+ mean_std_embedding_keys: null
371
+ modality_keys:
372
+ - annotation.human.task_description
373
+ sin_cos_embedding_keys: null
374
+ state: !!python/object:gr00t.data.types.ModalityConfig
375
+ action_configs: null
376
+ delta_indices:
377
+ - 0
378
+ mean_std_embedding_keys: null
379
+ modality_keys:
380
+ - left_leg
381
+ - right_leg
382
+ - waist
383
+ - left_arm
384
+ - right_arm
385
+ - left_hand
386
+ - right_hand
387
+ sin_cos_embedding_keys: null
388
+ video: !!python/object:gr00t.data.types.ModalityConfig
389
+ action_configs: null
390
+ delta_indices:
391
+ - 0
392
+ mean_std_embedding_keys: null
393
+ modality_keys:
394
+ - ego_view
395
+ sin_cos_embedding_keys: null
396
+ mode: scenario
397
+ multiprocessing_context: fork
398
+ num_shards_per_epoch: 100000
399
+ override_pretraining_statistics: false
400
+ random_chop: 0.0
401
+ seed: 42
402
+ shard_size: 1024
403
+ shuffle: true
404
+ subsample_ratio: 1.0
405
+ video_backend: torchcodec
406
+ experiment: gr00t.experiment_configs.stage_1.Stage1Experiment
407
+ load_config_path: null
408
+ model: !!python/object:gr00t.configs.model.gr00t_n1d6.MemoryGr00tN1d6Config
409
+ _attn_implementation_autoset: false
410
+ _attn_implementation_internal: null
411
+ _commit_hash: null
412
+ _name_or_path: ''
413
+ _output_attentions: false
414
+ add_bos_embedding: true
415
+ add_cross_attention: false
416
+ architectures: null
417
+ backbone_model_type: eagle
418
+ backbone_trainable_params_fp32: true
419
+ bad_words_ids: null
420
+ begin_suppress_tokens: null
421
+ block_size: 256
422
+ bos_token_id: null
423
+ chunk_size_feed_forward: 0
424
+ color_jitter_params: null
425
+ cross_attention_hidden_size: null
426
+ decay_frequency: 1
427
+ decoder_start_token_id: null
428
+ diffusion_model_cfg:
429
+ attention_head_dim: 48
430
+ dropout: 0.2
431
+ final_dropout: true
432
+ interleave_self_attention: true
433
+ norm_type: ada_norm
434
+ num_attention_heads: 32
435
+ num_layers: 32
436
+ output_dim: 1024
437
+ positional_embeddings: null
438
+ diversity_penalty: 0.0
439
+ do_sample: false
440
+ drop_ltm_memory: true
441
+ eagle_collator: true
442
+ early_stopping: false
443
+ encoder_no_repeat_ngram_size: 0
444
+ eos_token_id: null
445
+ exponential_decay_length_penalty: null
446
+ finetuning_task: null
447
+ forced_bos_token_id: null
448
+ forced_eos_token_id: null
449
+ id2label:
450
+ 0: LABEL_0
451
+ 1: LABEL_1
452
+ initial_recall_frequency: 10
453
+ is_decoder: false
454
+ is_encoder_decoder: false
455
+ label2id:
456
+ LABEL_0: 0
457
+ LABEL_1: 1
458
+ length_penalty: 1.0
459
+ load_bf16: false
460
+ ltm_retrieve_blocks: 10
461
+ max_length: 20
462
+ min_length: 0
463
+ model_name: nvidia/Eagle-Block2A-2B-v2
464
+ no_repeat_ngram_size: 0
465
+ num_beam_groups: 1
466
+ num_beams: 1
467
+ num_ltm_blocks: 10
468
+ num_return_sequences: 1
469
+ num_selector_layers: 2
470
+ num_total_blocks: 50
471
+ output_hidden_states: false
472
+ output_scores: false
473
+ pad_token_id: null
474
+ prefix: null
475
+ problem_type: null
476
+ pruned_heads: {}
477
+ random_retriever_length: false
478
+ random_rotation_angle: null
479
+ remove_invalid_values: false
480
+ repetition_penalty: 1.0
481
+ reproject_vision: false
482
+ return_dict: true
483
+ return_dict_in_generate: false
484
+ selector_hidden_dim: 256
485
+ sep_token_id: null
486
+ state_dropout_prob: 0.0
487
+ suppress_tokens: null
488
+ task_specific_params: null
489
+ temperature: 1.0
490
+ tf_legacy_loss: false
491
+ tie_encoder_decoder: false
492
+ tie_word_embeddings: true
493
+ tokenizer_class: null
494
+ top_k: 50
495
+ top_p: 1.0
496
+ torch_dtype: null
497
+ torchscript: false
498
+ transformers_version: null
499
+ tune_diffusion_model: false
500
+ tune_enc_query_proj: false
501
+ tune_key_proj: false
502
+ tune_llm: false
503
+ tune_memory_bos_embedding: false
504
+ tune_memory_loras: false
505
+ tune_memory_pos_embeddings: false
506
+ tune_projector: true
507
+ tune_query_proj: false
508
+ tune_top_llm_layers: 0
509
+ tune_visual: false
510
+ tune_vlln: false
511
+ typical_p: 1.0
512
+ use_bfloat16: false
513
+ use_relative_action: true
514
+ scenario: gr00t.experiment_configs.stage_1.Stage1Dataset
515
+ training: !!python/object:gr00t.configs.training.training_config.TrainingConfig
516
+ add_rl_callback: false
517
+ assert_loss_less_than: null
518
+ batch_size: null
519
+ bf16: true
520
+ dataloader_num_workers: 9
521
+ ddp_bucket_cap_mb: 100
522
+ deepspeed_stage: 2
523
+ enable_open_loop_eval: false
524
+ enable_profiling: false
525
+ eval_batch_size: 2
526
+ eval_bf16: true
527
+ eval_set_split_ratio: 0.1
528
+ eval_steps: 500
529
+ eval_strategy: 'no'
530
+ experiment_name: null
531
+ fp16: false
532
+ global_batch_size: 12
533
+ gradient_accumulation_steps: 1
534
+ gradient_checkpointing: false
535
+ learning_rate: 0.0001
536
+ logging_steps: 10
537
+ lr_scheduler_type: cosine
538
+ max_concurrent_uploads: 2
539
+ max_grad_norm: 1.0
540
+ max_retries: 3
541
+ max_steps: 20000
542
+ num_gpus: 1
543
+ open_loop_eval_plot_indices: null
544
+ open_loop_eval_steps_per_traj: 100
545
+ open_loop_eval_traj_ids:
546
+ - 0
547
+ optim: adamw_torch_fused
548
+ output_dir: models/attempt_2/stage_1_2/
549
+ remove_unused_columns: false
550
+ save_best_eval_metric_greater_is_better: true
551
+ save_best_eval_metric_name: ''
552
+ save_steps: 2000
553
+ save_total_limit: 5
554
+ save_vl_model: false
555
+ start_from_checkpoint: models/attempt_2/stage_1/checkpoint-12000/
556
+ tf32: true
557
+ transformers_access_token: null
558
+ transformers_cache_dir: null
559
+ transformers_local_files_only: false
560
+ transformers_trust_remote_code: true
561
+ upload_checkpoints: false
562
+ upload_every: 1000
563
+ upload_last_n_checkpoints: 5
564
+ use_ddp: false
565
+ use_wandb: true
566
+ wandb_project: finetune-gr00t-n1d6
567
+ warmup_ratio: 0.05
568
+ warmup_steps: 0
569
+ weight_decay: 1.0e-05
experiment_cfg/dataset_statistics.json ADDED
The diff for this file is too large to render. See raw diff
 
experiment_cfg/final_model_config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "Gr00tN1d6",
3
+ "model_dtype": "bfloat16",
4
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
5
+ "backbone_model_type": "eagle",
6
+ "model_revision": null,
7
+ "tune_top_llm_layers": 4,
8
+ "backbone_embedding_dim": 2048,
9
+ "tune_llm": false,
10
+ "tune_visual": false,
11
+ "select_layer": 16,
12
+ "reproject_vision": false,
13
+ "use_flash_attention": true,
14
+ "load_bf16": true,
15
+ "collator_overwrite_image_inputs": false,
16
+ "eagle_collator": true,
17
+ "backbone_trainable_params_fp32": true,
18
+ "apply_sincos_state_encoding": true,
19
+ "use_relative_action": true,
20
+ "max_state_dim": 128,
21
+ "max_action_dim": 128,
22
+ "action_horizon": 50,
23
+ "hidden_size": 1024,
24
+ "input_embedding_dim": 1536,
25
+ "add_pos_embed": true,
26
+ "attn_dropout": 0.2,
27
+ "use_vlln": true,
28
+ "max_seq_len": 1024,
29
+ "use_alternate_vl_dit": true,
30
+ "attend_text_every_n_blocks": 2,
31
+ "diffusion_model_cfg": {
32
+ "attention_head_dim": 48,
33
+ "dropout": 0.2,
34
+ "final_dropout": true,
35
+ "interleave_self_attention": true,
36
+ "norm_type": "ada_norm",
37
+ "num_attention_heads": 32,
38
+ "num_layers": 32,
39
+ "output_dim": 1024,
40
+ "positional_embeddings": null
41
+ },
42
+ "num_inference_timesteps": 4,
43
+ "noise_beta_alpha": 1.5,
44
+ "noise_beta_beta": 1.0,
45
+ "noise_s": 0.999,
46
+ "num_timestep_buckets": 1000,
47
+ "tune_projector": true,
48
+ "tune_diffusion_model": false,
49
+ "tune_vlln": false,
50
+ "state_dropout_prob": 0.0,
51
+ "state_additive_noise_scale": 0.0,
52
+ "max_num_embodiments": 32
53
+ }
experiment_cfg/final_processor_config.json ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad263dbb9b0d0da51b35ecf98514bfa31f9bd4ce63b575600f9a552a4974493f
3
+ size 4992611704
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cf480f5bf73530087d71890cf7a052f36cc564b771f327892967202d19a6bdf
3
+ size 4385706632
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
processor/embodiment_id.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "robocasa_panda_omron": 13,
3
+ "gr1": 20,
4
+ "behavior_r1_pro": 24,
5
+ "unitree_g1": 8,
6
+ "oxe_google": 0,
7
+ "oxe_widowx": 1,
8
+ "libero_panda": 2,
9
+ "new_embodiment": 10
10
+ }
processor/processor_config.json ADDED
@@ -0,0 +1,857 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "Gr00tN1d6ScenarioProcessor",
3
+ "processor_kwargs": {
4
+ "modality_configs": {
5
+ "behavior_r1_pro": {
6
+ "video": {
7
+ "delta_indices": [
8
+ 0
9
+ ],
10
+ "modality_keys": [
11
+ "observation.images.rgb.head_256_256",
12
+ "observation.images.rgb.left_wrist_256_256",
13
+ "observation.images.rgb.right_wrist_256_256"
14
+ ],
15
+ "sin_cos_embedding_keys": null,
16
+ "mean_std_embedding_keys": null,
17
+ "action_configs": null
18
+ },
19
+ "state": {
20
+ "delta_indices": [
21
+ 0
22
+ ],
23
+ "modality_keys": [
24
+ "robot_pos",
25
+ "robot_ori_cos",
26
+ "robot_ori_sin",
27
+ "robot_2d_ori",
28
+ "robot_2d_ori_cos",
29
+ "robot_2d_ori_sin",
30
+ "robot_lin_vel",
31
+ "robot_ang_vel",
32
+ "arm_left_qpos",
33
+ "arm_left_qpos_sin",
34
+ "arm_left_qpos_cos",
35
+ "eef_left_pos",
36
+ "eef_left_quat",
37
+ "gripper_left_qpos",
38
+ "arm_right_qpos",
39
+ "arm_right_qpos_sin",
40
+ "arm_right_qpos_cos",
41
+ "eef_right_pos",
42
+ "eef_right_quat",
43
+ "gripper_right_qpos",
44
+ "trunk_qpos"
45
+ ],
46
+ "sin_cos_embedding_keys": null,
47
+ "mean_std_embedding_keys": null,
48
+ "action_configs": null
49
+ },
50
+ "action": {
51
+ "delta_indices": [
52
+ 0,
53
+ 1,
54
+ 2,
55
+ 3,
56
+ 4,
57
+ 5,
58
+ 6,
59
+ 7,
60
+ 8,
61
+ 9,
62
+ 10,
63
+ 11,
64
+ 12,
65
+ 13,
66
+ 14,
67
+ 15,
68
+ 16,
69
+ 17,
70
+ 18,
71
+ 19,
72
+ 20,
73
+ 21,
74
+ 22,
75
+ 23,
76
+ 24,
77
+ 25,
78
+ 26,
79
+ 27,
80
+ 28,
81
+ 29,
82
+ 30,
83
+ 31
84
+ ],
85
+ "modality_keys": [
86
+ "base",
87
+ "torso",
88
+ "left_arm",
89
+ "left_gripper",
90
+ "right_arm",
91
+ "right_gripper"
92
+ ],
93
+ "sin_cos_embedding_keys": null,
94
+ "mean_std_embedding_keys": null,
95
+ "action_configs": [
96
+ {
97
+ "rep": "ABSOLUTE",
98
+ "type": "NON_EEF",
99
+ "format": "DEFAULT",
100
+ "state_key": null
101
+ },
102
+ {
103
+ "rep": "RELATIVE",
104
+ "type": "NON_EEF",
105
+ "format": "DEFAULT",
106
+ "state_key": "trunk_qpos"
107
+ },
108
+ {
109
+ "rep": "RELATIVE",
110
+ "type": "NON_EEF",
111
+ "format": "DEFAULT",
112
+ "state_key": "arm_left_qpos"
113
+ },
114
+ {
115
+ "rep": "ABSOLUTE",
116
+ "type": "NON_EEF",
117
+ "format": "DEFAULT",
118
+ "state_key": null
119
+ },
120
+ {
121
+ "rep": "RELATIVE",
122
+ "type": "NON_EEF",
123
+ "format": "DEFAULT",
124
+ "state_key": "arm_right_qpos"
125
+ },
126
+ {
127
+ "rep": "ABSOLUTE",
128
+ "type": "NON_EEF",
129
+ "format": "DEFAULT",
130
+ "state_key": null
131
+ }
132
+ ]
133
+ },
134
+ "language": {
135
+ "delta_indices": [
136
+ 0
137
+ ],
138
+ "modality_keys": [
139
+ "annotation.human.coarse_action"
140
+ ],
141
+ "sin_cos_embedding_keys": null,
142
+ "mean_std_embedding_keys": null,
143
+ "action_configs": null
144
+ }
145
+ },
146
+ "gr1": {
147
+ "video": {
148
+ "delta_indices": [
149
+ 0
150
+ ],
151
+ "modality_keys": [
152
+ "ego_view_bg_crop_pad_res256_freq20"
153
+ ],
154
+ "sin_cos_embedding_keys": null,
155
+ "mean_std_embedding_keys": null,
156
+ "action_configs": null
157
+ },
158
+ "state": {
159
+ "delta_indices": [
160
+ 0
161
+ ],
162
+ "modality_keys": [
163
+ "left_arm",
164
+ "right_arm",
165
+ "left_hand",
166
+ "right_hand",
167
+ "waist"
168
+ ],
169
+ "sin_cos_embedding_keys": [
170
+ "left_arm",
171
+ "right_arm",
172
+ "left_hand",
173
+ "right_hand",
174
+ "waist"
175
+ ],
176
+ "mean_std_embedding_keys": null,
177
+ "action_configs": null
178
+ },
179
+ "action": {
180
+ "delta_indices": [
181
+ 0,
182
+ 1,
183
+ 2,
184
+ 3,
185
+ 4,
186
+ 5,
187
+ 6,
188
+ 7,
189
+ 8,
190
+ 9,
191
+ 10,
192
+ 11,
193
+ 12,
194
+ 13,
195
+ 14,
196
+ 15
197
+ ],
198
+ "modality_keys": [
199
+ "left_arm",
200
+ "right_arm",
201
+ "left_hand",
202
+ "right_hand",
203
+ "waist"
204
+ ],
205
+ "sin_cos_embedding_keys": null,
206
+ "mean_std_embedding_keys": null,
207
+ "action_configs": [
208
+ {
209
+ "rep": "RELATIVE",
210
+ "type": "NON_EEF",
211
+ "format": "DEFAULT",
212
+ "state_key": null
213
+ },
214
+ {
215
+ "rep": "RELATIVE",
216
+ "type": "NON_EEF",
217
+ "format": "DEFAULT",
218
+ "state_key": null
219
+ },
220
+ {
221
+ "rep": "RELATIVE",
222
+ "type": "NON_EEF",
223
+ "format": "DEFAULT",
224
+ "state_key": null
225
+ },
226
+ {
227
+ "rep": "RELATIVE",
228
+ "type": "NON_EEF",
229
+ "format": "DEFAULT",
230
+ "state_key": null
231
+ },
232
+ {
233
+ "rep": "ABSOLUTE",
234
+ "type": "NON_EEF",
235
+ "format": "DEFAULT",
236
+ "state_key": null
237
+ }
238
+ ]
239
+ },
240
+ "language": {
241
+ "delta_indices": [
242
+ 0
243
+ ],
244
+ "modality_keys": [
245
+ "task"
246
+ ],
247
+ "sin_cos_embedding_keys": null,
248
+ "mean_std_embedding_keys": null,
249
+ "action_configs": null
250
+ }
251
+ },
252
+ "robocasa_panda_omron": {
253
+ "video": {
254
+ "delta_indices": [
255
+ 0
256
+ ],
257
+ "modality_keys": [
258
+ "res256_image_side_0",
259
+ "res256_image_side_1",
260
+ "res256_image_wrist_0"
261
+ ],
262
+ "sin_cos_embedding_keys": null,
263
+ "mean_std_embedding_keys": null,
264
+ "action_configs": null
265
+ },
266
+ "state": {
267
+ "delta_indices": [
268
+ 0
269
+ ],
270
+ "modality_keys": [
271
+ "end_effector_position_relative",
272
+ "end_effector_rotation_relative",
273
+ "gripper_qpos",
274
+ "base_position",
275
+ "base_rotation"
276
+ ],
277
+ "sin_cos_embedding_keys": null,
278
+ "mean_std_embedding_keys": null,
279
+ "action_configs": null
280
+ },
281
+ "action": {
282
+ "delta_indices": [
283
+ 0,
284
+ 1,
285
+ 2,
286
+ 3,
287
+ 4,
288
+ 5,
289
+ 6,
290
+ 7,
291
+ 8,
292
+ 9,
293
+ 10,
294
+ 11,
295
+ 12,
296
+ 13,
297
+ 14,
298
+ 15
299
+ ],
300
+ "modality_keys": [
301
+ "end_effector_position",
302
+ "end_effector_rotation",
303
+ "gripper_close",
304
+ "base_motion",
305
+ "control_mode"
306
+ ],
307
+ "sin_cos_embedding_keys": null,
308
+ "mean_std_embedding_keys": null,
309
+ "action_configs": [
310
+ {
311
+ "rep": "ABSOLUTE",
312
+ "type": "NON_EEF",
313
+ "format": "DEFAULT",
314
+ "state_key": null
315
+ },
316
+ {
317
+ "rep": "ABSOLUTE",
318
+ "type": "NON_EEF",
319
+ "format": "DEFAULT",
320
+ "state_key": null
321
+ },
322
+ {
323
+ "rep": "ABSOLUTE",
324
+ "type": "NON_EEF",
325
+ "format": "DEFAULT",
326
+ "state_key": null
327
+ },
328
+ {
329
+ "rep": "ABSOLUTE",
330
+ "type": "NON_EEF",
331
+ "format": "DEFAULT",
332
+ "state_key": null
333
+ },
334
+ {
335
+ "rep": "ABSOLUTE",
336
+ "type": "NON_EEF",
337
+ "format": "DEFAULT",
338
+ "state_key": null
339
+ }
340
+ ]
341
+ },
342
+ "language": {
343
+ "delta_indices": [
344
+ 0
345
+ ],
346
+ "modality_keys": [
347
+ "annotation.human.action.task_description"
348
+ ],
349
+ "sin_cos_embedding_keys": null,
350
+ "mean_std_embedding_keys": null,
351
+ "action_configs": null
352
+ }
353
+ },
354
+ "libero_panda": {
355
+ "video": {
356
+ "delta_indices": [
357
+ 0
358
+ ],
359
+ "modality_keys": [
360
+ "image",
361
+ "wrist_image"
362
+ ],
363
+ "sin_cos_embedding_keys": null,
364
+ "mean_std_embedding_keys": null,
365
+ "action_configs": null
366
+ },
367
+ "state": {
368
+ "delta_indices": [
369
+ 0
370
+ ],
371
+ "modality_keys": [
372
+ "x",
373
+ "y",
374
+ "z",
375
+ "roll",
376
+ "pitch",
377
+ "yaw",
378
+ "gripper"
379
+ ],
380
+ "sin_cos_embedding_keys": null,
381
+ "mean_std_embedding_keys": null,
382
+ "action_configs": null
383
+ },
384
+ "action": {
385
+ "delta_indices": [
386
+ 0,
387
+ 1,
388
+ 2,
389
+ 3,
390
+ 4,
391
+ 5,
392
+ 6,
393
+ 7,
394
+ 8,
395
+ 9,
396
+ 10,
397
+ 11,
398
+ 12,
399
+ 13,
400
+ 14,
401
+ 15
402
+ ],
403
+ "modality_keys": [
404
+ "x",
405
+ "y",
406
+ "z",
407
+ "roll",
408
+ "pitch",
409
+ "yaw",
410
+ "gripper"
411
+ ],
412
+ "sin_cos_embedding_keys": null,
413
+ "mean_std_embedding_keys": null,
414
+ "action_configs": [
415
+ {
416
+ "rep": "ABSOLUTE",
417
+ "type": "NON_EEF",
418
+ "format": "DEFAULT",
419
+ "state_key": null
420
+ },
421
+ {
422
+ "rep": "ABSOLUTE",
423
+ "type": "NON_EEF",
424
+ "format": "DEFAULT",
425
+ "state_key": null
426
+ },
427
+ {
428
+ "rep": "ABSOLUTE",
429
+ "type": "NON_EEF",
430
+ "format": "DEFAULT",
431
+ "state_key": null
432
+ },
433
+ {
434
+ "rep": "ABSOLUTE",
435
+ "type": "NON_EEF",
436
+ "format": "DEFAULT",
437
+ "state_key": null
438
+ },
439
+ {
440
+ "rep": "ABSOLUTE",
441
+ "type": "NON_EEF",
442
+ "format": "DEFAULT",
443
+ "state_key": null
444
+ },
445
+ {
446
+ "rep": "ABSOLUTE",
447
+ "type": "NON_EEF",
448
+ "format": "DEFAULT",
449
+ "state_key": null
450
+ },
451
+ {
452
+ "rep": "ABSOLUTE",
453
+ "type": "NON_EEF",
454
+ "format": "DEFAULT",
455
+ "state_key": null
456
+ }
457
+ ]
458
+ },
459
+ "language": {
460
+ "delta_indices": [
461
+ 0
462
+ ],
463
+ "modality_keys": [
464
+ "annotation.human.action.task_description"
465
+ ],
466
+ "sin_cos_embedding_keys": null,
467
+ "mean_std_embedding_keys": null,
468
+ "action_configs": null
469
+ }
470
+ },
471
+ "unitree_g1": {
472
+ "video": {
473
+ "delta_indices": [
474
+ 0
475
+ ],
476
+ "modality_keys": [
477
+ "ego_view"
478
+ ],
479
+ "sin_cos_embedding_keys": null,
480
+ "mean_std_embedding_keys": null,
481
+ "action_configs": null
482
+ },
483
+ "state": {
484
+ "delta_indices": [
485
+ 0
486
+ ],
487
+ "modality_keys": [
488
+ "left_leg",
489
+ "right_leg",
490
+ "waist",
491
+ "left_arm",
492
+ "right_arm",
493
+ "left_hand",
494
+ "right_hand"
495
+ ],
496
+ "sin_cos_embedding_keys": null,
497
+ "mean_std_embedding_keys": null,
498
+ "action_configs": null
499
+ },
500
+ "action": {
501
+ "delta_indices": [
502
+ 0,
503
+ 1,
504
+ 2,
505
+ 3,
506
+ 4,
507
+ 5,
508
+ 6,
509
+ 7,
510
+ 8,
511
+ 9,
512
+ 10,
513
+ 11,
514
+ 12,
515
+ 13,
516
+ 14,
517
+ 15,
518
+ 16,
519
+ 17,
520
+ 18,
521
+ 19,
522
+ 20,
523
+ 21,
524
+ 22,
525
+ 23,
526
+ 24,
527
+ 25,
528
+ 26,
529
+ 27,
530
+ 28,
531
+ 29
532
+ ],
533
+ "modality_keys": [
534
+ "left_arm",
535
+ "right_arm",
536
+ "left_hand",
537
+ "right_hand",
538
+ "waist",
539
+ "base_height_command",
540
+ "navigate_command"
541
+ ],
542
+ "sin_cos_embedding_keys": null,
543
+ "mean_std_embedding_keys": null,
544
+ "action_configs": [
545
+ {
546
+ "rep": "RELATIVE",
547
+ "type": "NON_EEF",
548
+ "format": "DEFAULT",
549
+ "state_key": null
550
+ },
551
+ {
552
+ "rep": "RELATIVE",
553
+ "type": "NON_EEF",
554
+ "format": "DEFAULT",
555
+ "state_key": null
556
+ },
557
+ {
558
+ "rep": "ABSOLUTE",
559
+ "type": "NON_EEF",
560
+ "format": "DEFAULT",
561
+ "state_key": null
562
+ },
563
+ {
564
+ "rep": "ABSOLUTE",
565
+ "type": "NON_EEF",
566
+ "format": "DEFAULT",
567
+ "state_key": null
568
+ },
569
+ {
570
+ "rep": "ABSOLUTE",
571
+ "type": "NON_EEF",
572
+ "format": "DEFAULT",
573
+ "state_key": null
574
+ },
575
+ {
576
+ "rep": "ABSOLUTE",
577
+ "type": "NON_EEF",
578
+ "format": "DEFAULT",
579
+ "state_key": null
580
+ },
581
+ {
582
+ "rep": "ABSOLUTE",
583
+ "type": "NON_EEF",
584
+ "format": "DEFAULT",
585
+ "state_key": null
586
+ }
587
+ ]
588
+ },
589
+ "language": {
590
+ "delta_indices": [
591
+ 0
592
+ ],
593
+ "modality_keys": [
594
+ "annotation.human.task_description"
595
+ ],
596
+ "sin_cos_embedding_keys": null,
597
+ "mean_std_embedding_keys": null,
598
+ "action_configs": null
599
+ }
600
+ },
601
+ "oxe_widowx": {
602
+ "video": {
603
+ "delta_indices": [
604
+ 0
605
+ ],
606
+ "modality_keys": [
607
+ "image_0"
608
+ ],
609
+ "sin_cos_embedding_keys": null,
610
+ "mean_std_embedding_keys": null,
611
+ "action_configs": null
612
+ },
613
+ "state": {
614
+ "delta_indices": [
615
+ 0
616
+ ],
617
+ "modality_keys": [
618
+ "x",
619
+ "y",
620
+ "z",
621
+ "roll",
622
+ "pitch",
623
+ "yaw",
624
+ "pad",
625
+ "gripper"
626
+ ],
627
+ "sin_cos_embedding_keys": null,
628
+ "mean_std_embedding_keys": null,
629
+ "action_configs": null
630
+ },
631
+ "action": {
632
+ "delta_indices": [
633
+ 0,
634
+ 1,
635
+ 2,
636
+ 3,
637
+ 4,
638
+ 5,
639
+ 6,
640
+ 7
641
+ ],
642
+ "modality_keys": [
643
+ "x",
644
+ "y",
645
+ "z",
646
+ "roll",
647
+ "pitch",
648
+ "yaw",
649
+ "gripper"
650
+ ],
651
+ "sin_cos_embedding_keys": null,
652
+ "mean_std_embedding_keys": [
653
+ "x",
654
+ "y",
655
+ "z",
656
+ "roll",
657
+ "pitch",
658
+ "yaw"
659
+ ],
660
+ "action_configs": [
661
+ {
662
+ "rep": "ABSOLUTE",
663
+ "type": "NON_EEF",
664
+ "format": "DEFAULT",
665
+ "state_key": null
666
+ },
667
+ {
668
+ "rep": "ABSOLUTE",
669
+ "type": "NON_EEF",
670
+ "format": "DEFAULT",
671
+ "state_key": null
672
+ },
673
+ {
674
+ "rep": "ABSOLUTE",
675
+ "type": "NON_EEF",
676
+ "format": "DEFAULT",
677
+ "state_key": null
678
+ },
679
+ {
680
+ "rep": "ABSOLUTE",
681
+ "type": "NON_EEF",
682
+ "format": "DEFAULT",
683
+ "state_key": null
684
+ },
685
+ {
686
+ "rep": "ABSOLUTE",
687
+ "type": "NON_EEF",
688
+ "format": "DEFAULT",
689
+ "state_key": null
690
+ },
691
+ {
692
+ "rep": "ABSOLUTE",
693
+ "type": "NON_EEF",
694
+ "format": "DEFAULT",
695
+ "state_key": null
696
+ },
697
+ {
698
+ "rep": "ABSOLUTE",
699
+ "type": "NON_EEF",
700
+ "format": "DEFAULT",
701
+ "state_key": null
702
+ }
703
+ ]
704
+ },
705
+ "language": {
706
+ "delta_indices": [
707
+ 0
708
+ ],
709
+ "modality_keys": [
710
+ "annotation.human.action.task_description"
711
+ ],
712
+ "sin_cos_embedding_keys": null,
713
+ "mean_std_embedding_keys": null,
714
+ "action_configs": null
715
+ }
716
+ },
717
+ "oxe_google": {
718
+ "video": {
719
+ "delta_indices": [
720
+ 0
721
+ ],
722
+ "modality_keys": [
723
+ "image"
724
+ ],
725
+ "sin_cos_embedding_keys": null,
726
+ "mean_std_embedding_keys": null,
727
+ "action_configs": null
728
+ },
729
+ "state": {
730
+ "delta_indices": [
731
+ 0
732
+ ],
733
+ "modality_keys": [
734
+ "x",
735
+ "y",
736
+ "z",
737
+ "rx",
738
+ "ry",
739
+ "rz",
740
+ "rw",
741
+ "gripper"
742
+ ],
743
+ "sin_cos_embedding_keys": null,
744
+ "mean_std_embedding_keys": null,
745
+ "action_configs": null
746
+ },
747
+ "action": {
748
+ "delta_indices": [
749
+ 0,
750
+ 1,
751
+ 2,
752
+ 3,
753
+ 4,
754
+ 5,
755
+ 6,
756
+ 7
757
+ ],
758
+ "modality_keys": [
759
+ "x",
760
+ "y",
761
+ "z",
762
+ "roll",
763
+ "pitch",
764
+ "yaw",
765
+ "gripper"
766
+ ],
767
+ "sin_cos_embedding_keys": null,
768
+ "mean_std_embedding_keys": [
769
+ "x",
770
+ "y",
771
+ "z",
772
+ "roll",
773
+ "pitch",
774
+ "yaw"
775
+ ],
776
+ "action_configs": [
777
+ {
778
+ "rep": "ABSOLUTE",
779
+ "type": "NON_EEF",
780
+ "format": "DEFAULT",
781
+ "state_key": null
782
+ },
783
+ {
784
+ "rep": "ABSOLUTE",
785
+ "type": "NON_EEF",
786
+ "format": "DEFAULT",
787
+ "state_key": null
788
+ },
789
+ {
790
+ "rep": "ABSOLUTE",
791
+ "type": "NON_EEF",
792
+ "format": "DEFAULT",
793
+ "state_key": null
794
+ },
795
+ {
796
+ "rep": "ABSOLUTE",
797
+ "type": "NON_EEF",
798
+ "format": "DEFAULT",
799
+ "state_key": null
800
+ },
801
+ {
802
+ "rep": "ABSOLUTE",
803
+ "type": "NON_EEF",
804
+ "format": "DEFAULT",
805
+ "state_key": null
806
+ },
807
+ {
808
+ "rep": "ABSOLUTE",
809
+ "type": "NON_EEF",
810
+ "format": "DEFAULT",
811
+ "state_key": null
812
+ },
813
+ {
814
+ "rep": "ABSOLUTE",
815
+ "type": "NON_EEF",
816
+ "format": "DEFAULT",
817
+ "state_key": null
818
+ }
819
+ ]
820
+ },
821
+ "language": {
822
+ "delta_indices": [
823
+ 0
824
+ ],
825
+ "modality_keys": [
826
+ "annotation.human.action.task_description"
827
+ ],
828
+ "sin_cos_embedding_keys": null,
829
+ "mean_std_embedding_keys": null,
830
+ "action_configs": null
831
+ }
832
+ }
833
+ },
834
+ "image_crop_size": null,
835
+ "image_target_size": null,
836
+ "use_albumentations": true,
837
+ "random_rotation_angle": null,
838
+ "color_jitter_params": {
839
+ "brightness": 0.3,
840
+ "contrast": 0.4,
841
+ "saturation": 0.5,
842
+ "hue": 0.08
843
+ },
844
+ "shortest_image_edge": 256,
845
+ "crop_fraction": 0.95,
846
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
847
+ "model_type": "eagle",
848
+ "formalize_language": true,
849
+ "max_state_dim": 128,
850
+ "max_action_dim": 128,
851
+ "max_action_horizon": 50,
852
+ "use_percentiles": false,
853
+ "clip_outliers": true,
854
+ "apply_sincos_state_encoding": true,
855
+ "use_relative_action": true
856
+ }
857
+ }
processor/statistics.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b79c4dd31bacb974d13835e6daead4b8fed3e76becb4c5e306d20af9f1f9ae78
3
+ size 5777
wandb_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"project": "finetune-gr00t-n1d6", "run_id": "stage_1_2"}