mingxinz committed on Jan 28

Commit

7ab4492

verified ·

1 Parent(s): 833b08c

Upload folder using huggingface_hub

Browse files

Files changed (18) hide show

config.json +70 -0
embodiment_id.json +10 -0
experiment_cfg/conf.yaml +234 -0
experiment_cfg/config.yaml +271 -0
experiment_cfg/dataset_statistics.json +541 -0
experiment_cfg/final_model_config.json +53 -0
experiment_cfg/final_processor_config.json +0 -0
model-00001-of-00003.safetensors +3 -0
model-00002-of-00003.safetensors +3 -0
model-00003-of-00003.safetensors +3 -0
model.safetensors.index.json +0 -0
processor_config.json +485 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
statistics.json +0 -0
trainer_state.json +0 -0
training_args.bin +3 -0
wandb_config.json +1 -0

config.json ADDED Viewed

	@@ -0,0 +1,70 @@

+{
+  "action_horizon": 50,
+  "add_pos_embed": true,
+  "apply_sincos_state_encoding": true,
+  "architectures": [
+    "Gr00tN1d6"
+  ],
+  "attn_dropout": 0.2,
+  "attn_implementation": null,
+  "backbone_embedding_dim": 2048,
+  "backbone_model_type": "eagle",
+  "backbone_trainable_params_fp32": true,
+  "collator_overwrite_image_inputs": false,
+  "color_jitter_params": {
+    "brightness": 0.1,
+    "contrast": 0.1,
+    "hue": 0.1,
+    "saturation": 0.1
+  },
+  "crop_fraction": 0.95,
+  "diffusion_model_cfg": {
+    "attention_head_dim": 48,
+    "dropout": 0.2,
+    "final_dropout": true,
+    "interleave_self_attention": true,
+    "norm_type": "ada_norm",
+    "num_attention_heads": 32,
+    "num_layers": 32,
+    "output_dim": 1024,
+    "positional_embeddings": null
+  },
+  "eagle_collator": true,
+  "formalize_language": true,
+  "gemma_collator": false,
+  "hidden_size": 1024,
+  "image_crop_size": null,
+  "image_target_size": null,
+  "input_embedding_dim": 1536,
+  "load_bf16": true,
+  "max_action_dim": 128,
+  "max_num_embodiments": 32,
+  "max_seq_len": 1024,
+  "max_state_dim": 128,
+  "model_dtype": "bfloat16",
+  "model_name": "nvidia/Eagle-Block2A-2B-v2",
+  "model_type": "Gr00tN1d6",
+  "noise_beta_alpha": 1.5,
+  "noise_beta_beta": 1.0,
+  "noise_s": 0.999,
+  "num_inference_timesteps": 4,
+  "num_timestep_buckets": 1000,
+  "random_rotation_angle": null,
+  "reproject_vision": false,
+  "select_layer": 16,
+  "shortest_image_edge": 256,
+  "state_dropout_prob": 0.0,
+  "torch_dtype": "float32",
+  "transformers_version": "4.51.3",
+  "tune_diffusion_model": true,
+  "tune_llm": false,
+  "tune_projector": true,
+  "tune_top_llm_layers": 4,
+  "tune_visual": true,
+  "tune_vlln": true,
+  "use_albumentations_transforms": true,
+  "use_alternate_vl_dit": true,
+  "use_flash_attention": true,
+  "use_relative_action": true,
+  "use_vlln": true
+}

embodiment_id.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "robocasa_panda_omron": 13,
+  "gr1": 20,
+  "behavior_r1_pro": 24,
+  "unitree_g1": 8,
+  "oxe_google": 0,
+  "oxe_widowx": 1,
+  "libero_panda": 2,
+  "new_embodiment": 10
+}

experiment_cfg/conf.yaml ADDED Viewed

	@@ -0,0 +1,234 @@

+load_config_path: null
+model:
+  model_type: Gr00tN1d6
+  model_dtype: bfloat16
+  model_name: nvidia/Eagle-Block2A-2B-v2
+  backbone_model_type: eagle
+  model_revision: null
+  tune_top_llm_layers: 4
+  backbone_embedding_dim: 2048
+  tune_llm: false
+  tune_visual: true
+  select_layer: 16
+  reproject_vision: false
+  use_flash_attention: true
+  load_bf16: false
+  collator_overwrite_image_inputs: false
+  eagle_collator: true
+  backbone_trainable_params_fp32: true
+  image_crop_size: null
+  image_target_size: null
+  shortest_image_edge: 256
+  crop_fraction: 0.95
+  random_rotation_angle: null
+  color_jitter_params:
+    brightness: 0.3
+    contrast: 0.4
+    saturation: 0.5
+    hue: 0.08
+  use_albumentations_transforms: true
+  formalize_language: true
+  apply_sincos_state_encoding: false
+  use_relative_action: true
+  max_state_dim: 29
+  max_action_dim: 29
+  action_horizon: 16
+  hidden_size: 1024
+  input_embedding_dim: 1536
+  add_pos_embed: true
+  attn_dropout: 0.2
+  use_vlln: true
+  max_seq_len: 1024
+  use_alternate_vl_dit: true
+  attend_text_every_n_blocks: 2
+  diffusion_model_cfg:
+    positional_embeddings: null
+    num_layers: 32
+    num_attention_heads: 32
+    attention_head_dim: 48
+    norm_type: ada_norm
+    dropout: 0.2
+    final_dropout: true
+    output_dim: 1024
+    interleave_self_attention: true
+  num_inference_timesteps: 4
+  noise_beta_alpha: 1.5
+  noise_beta_beta: 1.0
+  noise_s: 0.999
+  num_timestep_buckets: 1000
+  tune_projector: true
+  tune_diffusion_model: true
+  tune_vlln: true
+  state_dropout_prob: 0.0
+  state_additive_noise_scale: 0.0
+  max_num_embodiments: 32
+data:
+  datasets:
+  - dataset_paths:
+    - /datasets/orca-sim-pick-and-place-mimic/stage1_3_cosmos/lerobot
+    - /datasets/orca-sim-pick-and-place-mimic/stage1_5_cosmos/lerobot
+    - /datasets/orca-sim-pick-and-place-mimic/stage1_7_cosmos/lerobot
+    - /datasets/orca-sim-pick-and-place-mimic/stage1_8_cosmos/lerobot
+    embodiment_tag: new_embodiment
+    mix_ratio: 1.0
+    dataset_type: physical_embodiment
+    val_dataset_path: null
+  modality_configs:
+    new_embodiment:
+      video:
+        delta_indices:
+        - 0
+        modality_keys:
+        - ego_view
+        sin_cos_embedding_keys: null
+        mean_std_embedding_keys: null
+        action_configs: null
+      state:
+        delta_indices:
+        - 0
+        modality_keys:
+        - left_arm
+        - right_arm
+        - left_hand
+        - right_hand
+        - waist
+        sin_cos_embedding_keys: null
+        mean_std_embedding_keys: null
+        action_configs: null
+      action:
+        delta_indices:
+        - 0
+        - 1
+        - 2
+        - 3
+        - 4
+        - 5
+        - 6
+        - 7
+        - 8
+        - 9
+        - 10
+        - 11
+        - 12
+        - 13
+        - 14
+        - 15
+        modality_keys:
+        - left_arm
+        - right_arm
+        - left_hand
+        - right_hand
+        - base_height_command
+        - navigate_command
+        sin_cos_embedding_keys: null
+        mean_std_embedding_keys: null
+        action_configs:
+        - rep: ABSOLUTE
+          type: NON_EEF
+          format: DEFAULT
+          state_key: null
+        - rep: ABSOLUTE
+          type: NON_EEF
+          format: DEFAULT
+          state_key: null
+        - rep: ABSOLUTE
+          type: NON_EEF
+          format: DEFAULT
+          state_key: null
+        - rep: ABSOLUTE
+          type: NON_EEF
+          format: DEFAULT
+          state_key: null
+        - rep: ABSOLUTE
+          type: NON_EEF
+          format: DEFAULT
+          state_key: null
+        - rep: ABSOLUTE
+          type: NON_EEF
+          format: DEFAULT
+          state_key: null
+      language:
+        delta_indices:
+        - 0
+        modality_keys:
+        - annotation.human.task_description
+        sin_cos_embedding_keys: null
+        mean_std_embedding_keys: null
+        action_configs: null
+  download_cache: false
+  shard_size: 1024
+  episode_sampling_rate: 0.1
+  num_shards_per_epoch: 100000
+  override_pretraining_statistics: false
+  mode: single_turn
+  random_chop: 0.0
+  mock_dataset_mode: false
+  shuffle: true
+  seed: 42
+  multiprocessing_context: fork
+  allow_padding: false
+  subsample_ratio: 1.0
+  image_crop_size:
+  - 244
+  - 244
+  image_target_size:
+  - 224
+  - 224
+  video_backend: torchcodec
+training:
+  output_dir: /models/ORCA-GROOT-N1.6-Sim-Pick-Place
+  experiment_name: null
+  max_steps: 100000
+  global_batch_size: 32
+  batch_size: null
+  gradient_accumulation_steps: 1
+  learning_rate: 0.0001
+  lr_scheduler_type: cosine
+  weight_decay: 1.0e-05
+  warmup_ratio: 0.05
+  warmup_steps: 0
+  max_grad_norm: 1.0
+  optim: adamw_torch
+  start_from_checkpoint: nvidia/GR00T-N1.6-3B
+  tf32: true
+  fp16: false
+  bf16: true
+  eval_bf16: true
+  logging_steps: 10
+  save_steps: 10000
+  save_total_limit: 9
+  save_vl_model: false
+  upload_checkpoints: false
+  upload_every: 1000
+  upload_last_n_checkpoints: 5
+  max_concurrent_uploads: 2
+  eval_strategy: 'no'
+  eval_steps: 500
+  eval_set_split_ratio: 0.1
+  eval_batch_size: 2
+  save_best_eval_metric_name: ''
+  save_best_eval_metric_greater_is_better: true
+  deepspeed_stage: 2
+  gradient_checkpointing: false
+  transformers_trust_remote_code: true
+  transformers_local_files_only: false
+  transformers_cache_dir: null
+  transformers_access_token: null
+  use_ddp: false
+  ddp_bucket_cap_mb: 100
+  num_gpus: 1
+  dataloader_num_workers: 8
+  remove_unused_columns: false
+  use_wandb: true
+  wandb_project: finetune-gr00t-n1d6
+  enable_profiling: false
+  max_retries: 3
+  assert_loss_less_than: null
+  add_rl_callback: false
+  enable_open_loop_eval: false
+  open_loop_eval_traj_ids:
+  - 0
+  open_loop_eval_steps_per_traj: 100
+  open_loop_eval_plot_indices: null
+max_steps: 100000
+save_steps: 10000

experiment_cfg/config.yaml ADDED Viewed

	@@ -0,0 +1,271 @@

+!!python/object:gr00t.configs.base_config.Config
+data: !!python/object:gr00t.configs.data.data_config.DataConfig
+  allow_padding: false
+  datasets:
+  - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
+    dataset_paths:
+    - /datasets/orca-sim-pick-and-place-mimic/stage1_3_cosmos/lerobot
+    - /datasets/orca-sim-pick-and-place-mimic/stage1_5_cosmos/lerobot
+    - /datasets/orca-sim-pick-and-place-mimic/stage1_7_cosmos/lerobot
+    - /datasets/orca-sim-pick-and-place-mimic/stage1_8_cosmos/lerobot
+    dataset_type: physical_embodiment
+    embodiment_tag: new_embodiment
+    mix_ratio: 1.0
+    val_dataset_path: null
+  download_cache: false
+  episode_sampling_rate: 0.1
+  image_crop_size:
+  - 244
+  - 244
+  image_target_size:
+  - 224
+  - 224
+  mock_dataset_mode: false
+  modality_configs:
+    new_embodiment:
+      action: !!python/object:gr00t.data.types.ModalityConfig
+        action_configs:
+        - !!python/object:gr00t.data.types.ActionConfig
+          format: &id001 !!python/object/apply:gr00t.data.types.ActionFormat
+          - default
+          rep: &id002 !!python/object/apply:gr00t.data.types.ActionRepresentation
+          - absolute
+          state_key: null
+          type: &id003 !!python/object/apply:gr00t.data.types.ActionType
+          - non_eef
+        - !!python/object:gr00t.data.types.ActionConfig
+          format: *id001
+          rep: *id002
+          state_key: null
+          type: *id003
+        - !!python/object:gr00t.data.types.ActionConfig
+          format: *id001
+          rep: *id002
+          state_key: null
+          type: *id003
+        - !!python/object:gr00t.data.types.ActionConfig
+          format: *id001
+          rep: *id002
+          state_key: null
+          type: *id003
+        - !!python/object:gr00t.data.types.ActionConfig
+          format: *id001
+          rep: *id002
+          state_key: null
+          type: *id003
+        - !!python/object:gr00t.data.types.ActionConfig
+          format: *id001
+          rep: *id002
+          state_key: null
+          type: *id003
+        delta_indices:
+        - 0
+        - 1
+        - 2
+        - 3
+        - 4
+        - 5
+        - 6
+        - 7
+        - 8
+        - 9
+        - 10
+        - 11
+        - 12
+        - 13
+        - 14
+        - 15
+        mean_std_embedding_keys: null
+        modality_keys:
+        - left_arm
+        - right_arm
+        - left_hand
+        - right_hand
+        - base_height_command
+        - navigate_command
+        sin_cos_embedding_keys: null
+      language: !!python/object:gr00t.data.types.ModalityConfig
+        action_configs: null
+        delta_indices:
+        - 0
+        mean_std_embedding_keys: null
+        modality_keys:
+        - annotation.human.task_description
+        sin_cos_embedding_keys: null
+      state: !!python/object:gr00t.data.types.ModalityConfig
+        action_configs: null
+        delta_indices:
+        - 0
+        mean_std_embedding_keys: null
+        modality_keys:
+        - left_arm
+        - right_arm
+        - left_hand
+        - right_hand
+        - waist
+        sin_cos_embedding_keys: null
+      video: !!python/object:gr00t.data.types.ModalityConfig
+        action_configs: null
+        delta_indices:
+        - 0
+        mean_std_embedding_keys: null
+        modality_keys:
+        - ego_view
+        sin_cos_embedding_keys: null
+  mode: single_turn
+  multiprocessing_context: fork
+  num_shards_per_epoch: 100000
+  override_pretraining_statistics: false
+  random_chop: 0.0
+  seed: 42
+  shard_size: 1024
+  shuffle: true
+  subsample_ratio: 1.0
+  video_backend: torchcodec
+load_config_path: null
+model: !!python/object:gr00t.configs.model.gr00t_n1d6.Gr00tN1d6Config
+  _attn_implementation_autoset: false
+  _attn_implementation_internal: null
+  _commit_hash: null
+  _name_or_path: ''
+  add_cross_attention: false
+  architectures: null
+  backbone_model_type: eagle
+  backbone_trainable_params_fp32: true
+  bad_words_ids: null
+  begin_suppress_tokens: null
+  bos_token_id: null
+  chunk_size_feed_forward: 0
+  color_jitter_params:
+    brightness: 0.3
+    contrast: 0.4
+    hue: 0.08
+    saturation: 0.5
+  cross_attention_hidden_size: null
+  decoder_start_token_id: null
+  diffusion_model_cfg:
+    attention_head_dim: 48
+    dropout: 0.2
+    final_dropout: true
+    interleave_self_attention: true
+    norm_type: ada_norm
+    num_attention_heads: 32
+    num_layers: 32
+    output_dim: 1024
+    positional_embeddings: null
+  diversity_penalty: 0.0
+  do_sample: false
+  eagle_collator: true
+  early_stopping: false
+  encoder_no_repeat_ngram_size: 0
+  eos_token_id: null
+  exponential_decay_length_penalty: null
+  finetuning_task: null
+  forced_bos_token_id: null
+  forced_eos_token_id: null
+  id2label:
+    0: LABEL_0
+    1: LABEL_1
+  is_decoder: false
+  is_encoder_decoder: false
+  label2id:
+    LABEL_0: 0
+    LABEL_1: 1
+  length_penalty: 1.0
+  load_bf16: false
+  max_length: 20
+  min_length: 0
+  model_name: nvidia/Eagle-Block2A-2B-v2
+  no_repeat_ngram_size: 0
+  num_beam_groups: 1
+  num_beams: 1
+  num_return_sequences: 1
+  output_attentions: false
+  output_hidden_states: false
+  output_scores: false
+  pad_token_id: null
+  prefix: null
+  problem_type: null
+  pruned_heads: {}
+  random_rotation_angle: null
+  remove_invalid_values: false
+  repetition_penalty: 1.0
+  reproject_vision: false
+  return_dict: true
+  return_dict_in_generate: false
+  sep_token_id: null
+  state_dropout_prob: 0.0
+  suppress_tokens: null
+  task_specific_params: null
+  temperature: 1.0
+  tf_legacy_loss: false
+  tie_encoder_decoder: false
+  tie_word_embeddings: true
+  tokenizer_class: null
+  top_k: 50
+  top_p: 1.0
+  torch_dtype: null
+  torchscript: false
+  transformers_version: null
+  tune_diffusion_model: true
+  tune_llm: false
+  tune_projector: true
+  tune_visual: true
+  typical_p: 1.0
+  use_bfloat16: false
+  use_relative_action: true
+training: !!python/object:gr00t.configs.training.training_config.TrainingConfig
+  add_rl_callback: false
+  assert_loss_less_than: null
+  batch_size: null
+  bf16: true
+  dataloader_num_workers: 8
+  ddp_bucket_cap_mb: 100
+  deepspeed_stage: 2
+  enable_open_loop_eval: false
+  enable_profiling: false
+  eval_batch_size: 2
+  eval_bf16: true
+  eval_set_split_ratio: 0.1
+  eval_steps: 500
+  eval_strategy: 'no'
+  experiment_name: null
+  fp16: false
+  global_batch_size: 32
+  gradient_accumulation_steps: 1
+  gradient_checkpointing: false
+  learning_rate: 0.0001
+  logging_steps: 10
+  lr_scheduler_type: cosine
+  max_concurrent_uploads: 2
+  max_grad_norm: 1.0
+  max_retries: 3
+  max_steps: 100000
+  num_gpus: 1
+  open_loop_eval_plot_indices: null
+  open_loop_eval_steps_per_traj: 100
+  open_loop_eval_traj_ids:
+  - 0
+  optim: adamw_torch
+  output_dir: /models/ORCA-GROOT-N1.6-Sim-Pick-Place
+  remove_unused_columns: false
+  save_best_eval_metric_greater_is_better: true
+  save_best_eval_metric_name: ''
+  save_steps: 10000
+  save_total_limit: 9
+  save_vl_model: false
+  start_from_checkpoint: nvidia/GR00T-N1.6-3B
+  tf32: true
+  transformers_access_token: null
+  transformers_cache_dir: null
+  transformers_local_files_only: false
+  transformers_trust_remote_code: true
+  upload_checkpoints: false
+  upload_every: 1000
+  upload_last_n_checkpoints: 5
+  use_ddp: false
+  use_wandb: true
+  wandb_project: finetune-gr00t-n1d6
+  warmup_ratio: 0.05
+  warmup_steps: 0
+  weight_decay: 1.0e-05

experiment_cfg/dataset_statistics.json ADDED Viewed

	@@ -0,0 +1,541 @@

+{
+  "new_embodiment": {
+    "state": {
+      "left_arm": {
+        "min": [
+          -1.6188877820968628,
+          -0.8244653344154358,
+          -0.6688740253448486,
+          -0.9937067627906799,
+          -1.8114137649536133,
+          -1.1228705644607544,
+          -1.6144299507141113
+        ],
+        "max": [
+          0.5598819255828857,
+          1.3845295906066895,
+          1.3240509033203125,
+          1.3527957201004028,
+          1.7814620733261108,
+          1.6144317388534546,
+          1.1065175533294678
+        ],
+        "mean": [
+          -0.37701161097217084,
+          0.622215387442254,
+          0.37503467529814766,
+          0.15783503666421436,
+          0.2399788372564,
+          1.1200109759703378,
+          -0.9174301245354659
+        ],
+        "std": [
+          0.4946083775214465,
+          0.29932238916855064,
+          0.32772549348322094,
+          0.48449540532395885,
+          0.40544754602526073,
+          0.3511242435683432,
+          0.41043898365089426
+        ],
+        "q01": [
+          -1.4819707024097442,
+          -0.14474324360489793,
+          -0.574706056714058,
+          -0.8940612733364105,
+          -1.4438544452190398,
+          0.14948371022939683,
+          -1.6144284009933472
+        ],
+        "q99": [
+          0.19417367294430646,
+          1.3513104689121245,
+          1.0247645223140713,
+          1.2698133671283716,
+          1.2623580229282378,
+          1.6144297122955322,
+          0.6261248129606154
+        ]
+      },
+      "right_arm": {
+        "min": [
+          -1.5973707437515259,
+          -1.4186440706253052,
+          -1.3715554475784302,
+          -0.9967253804206848,
+          -1.9683640003204346,
+          -1.2039880752563477,
+          -0.7873280644416809
+        ],
+        "max": [
+          0.6645666360855103,
+          0.7108421325683594,
+          0.8019299507141113,
+          1.791968584060669,
+          1.6332509517669678,
+          1.6144312620162964,
+          1.6144299507141113
+        ],
+        "mean": [
+          -0.35405250633789215,
+          -0.6411186511942882,
+          -0.3977095291314536,
+          0.17245792957725906,
+          -0.23415605063914072,
+          1.1111519218280617,
+          0.927019808860804
+        ],
+        "std": [
+          0.48128610921576437,
+          0.3071013246943398,
+          0.3253459083547921,
+          0.4865591021694431,
+          0.40655478518574584,
+          0.35977562430478466,
+          0.39641489447232386
+        ],
+        "q01": [
+          -1.4656522178649902,
+          -1.39724303483963,
+          -1.0831108903884887,
+          -0.930193657875061,
+          -1.2693367302417755,
+          0.10924758933484555,
+          -0.5172812539339066
+        ],
+        "q99": [
+          0.20503960207104668,
+          0.16118972286581973,
+          0.5304996186494825,
+          1.3874476826190947,
+          1.3712920653820024,
+          1.6144297122955322,
+          1.611855911016464
+        ]
+      },
+      "left_hand": {
+        "min": [
+          -0.6636313796043396,
+          -1.2668558359146118,
+          -0.6931474208831787,
+          -1.238078236579895,
+          -0.15442876517772675,
+          -0.024228721857070923,
+          -1.3946487342764158e-07
+        ],
+        "max": [
+          1.4664448144685593e-07,
+          3.774755796825957e-08,
+          2.3912218239274807e-05,
+          1.4326724340207875e-05,
+          0.06299737840890884,
+          0.7382361888885498,
+          0.8579182028770447
+        ],
+        "mean": [
+          -0.32608566497335373,
+          -0.7007213049377038,
+          -0.3091359140462433,
+          -0.7064073512096278,
+          -0.0007393016489160969,
+          0.38868270547974193,
+          0.40552368424586116
+        ],
+        "std": [
+          0.27386463929352794,
+          0.5799191449185522,
+          0.25783882398027064,
+          0.5828784381585906,
+          0.010326922473384853,
+          0.32419180806582787,
+          0.33414980089677404
+        ],
+        "q01": [
+          -0.6073416697978974,
+          -1.2136946415901184,
+          -0.5961684477329254,
+          -1.2116613757610322,
+          -0.06334078542888165,
+          -0.002377869929186999,
+          5.822703361135773e-11
+        ],
+        "q99": [
+          5.486197252047003e-10,
+          2.372745946943576e-10,
+          7.51225351369729e-10,
+          2.186030552409053e-10,
+          0.03812098965048779,
+          0.7133938479423522,
+          0.7890969663858408
+        ]
+      },
+      "right_hand": {
+        "min": [
+          -1.1264450705539275e-07,
+          -4.103394246612879e-07,
+          -2.3143680664361455e-05,
+          -2.2077354515204206e-05,
+          -0.11818881332874298,
+          -0.7287072539329529,
+          -0.8129876852035522
+        ],
+        "max": [
+          0.6552737355232239,
+          1.2486000061035156,
+          0.6695961356163025,
+          1.2865338325500488,
+          0.07462365180253983,
+          0.0623926967382431,
+          1.360115504667192e-07
+        ],
+        "mean": [
+          0.32603508115604224,
+          0.7019679585829476,
+          0.305846294819914,
+          0.7061730283775077,
+          0.0009488441506462382,
+          -0.38256330225641366,
+          -0.4028953769743837
+        ],
+        "std": [
+          0.27344637434110475,
+          0.5808696397330089,
+          0.25478759263415374,
+          0.581769250960524,
+          0.009655141392575449,
+          0.32200539057458083,
+          0.33302532344362673
+        ],
+        "q01": [
+          -7.135985397033196e-10,
+          -2.377115057572432e-10,
+          -8.369249776540855e-10,
+          -1.8977105553652506e-10,
+          -0.04443687982857227,
+          -0.7134130877256394,
+          -0.7536908400058746
+        ],
+        "q99": [
+          0.631313579082489,
+          1.2208076870441436,
+          0.5956605392694473,
+          1.2157493793964385,
+          0.05435568977147335,
+          0.002578642389271219,
+          0.0
+        ]
+      },
+      "waist": {
+        "min": [
+          -0.08484945446252823,
+          -0.17682865262031555,
+          -0.04478275775909424
+        ],
+        "max": [
+          0.06575624644756317,
+          0.15222139656543732,
+          0.21880359947681427
+        ],
+        "mean": [
+          0.0006298807157444096,
+          0.004861228026153632,
+          0.07411838826162137
+        ],
+        "std": [
+          0.01696499541536457,
+          0.054129893445569476,
+          0.021960865412836504
+        ],
+        "q01": [
+          -0.03211269959807396,
+          -0.10020484030246735,
+          0.019505513962358237
+        ],
+        "q99": [
+          0.03720596775412556,
+          0.12259715944528575,
+          0.16097534537315367
+        ]
+      }
+    },
+    "action": {
+      "left_arm": {
+        "min": [
+          -1.7263685464859009,
+          -0.8693848848342896,
+          -0.6196113228797913,
+          -1.047199010848999,
+          -1.961471676826477,
+          -1.1776831150054932,
+          -1.6144285202026367
+        ],
+        "max": [
+          0.5359810590744019,
+          1.4409669637680054,
+          1.2984442710876465,
+          1.2675622701644897,
+          1.7942465543746948,
+          1.6144285202026367,
+          0.9927496314048767
+        ],
+        "mean": [
+          -0.4278667482319257,
+          0.649818023309013,
+          0.3946276520261701,
+          0.07472400054249247,
+          0.20272922651558525,
+          1.0919044724363365,
+          -0.9606466530174609
+        ],
+        "std": [
+          0.5163644120141647,
+          0.31051259045659124,
+          0.3232428585094077,
+          0.4900628976458507,
+          0.40242868284550015,
+          0.36495802692479223,
+          0.416769561480303
+        ],
+        "q01": [
+          -1.5964591109752655,
+          -0.23531204849481582,
+          -0.5155686557292938,
+          -0.9812510508298874,
+          -1.6113185107707977,
+          0.096123421266675,
+          -1.6144285202026367
+        ],
+        "q99": [
+          0.17772121801972385,
+          1.407341595888138,
+          1.0876906502246857,
+          1.1679468894004816,
+          1.134265422821045,
+          1.6144285202026367,
+          0.5308546763658487
+        ]
+      },
+      "right_arm": {
+        "min": [
+          -1.6998568773269653,
+          -1.483291745185852,
+          -1.443288803100586,
+          -1.047199010848999,
+          -1.9721182584762573,
+          -1.2583476305007935,
+          -0.9640278816223145
+        ],
+        "max": [
+          0.6452722549438477,
+          0.7532204389572144,
+          0.6300758123397827,
+          1.707554578781128,
+          1.775823950767517,
+          1.6144285202026367,
+          1.6144285202026367
+        ],
+        "mean": [
+          -0.4050310619224775,
+          -0.6717719246220115,
+          -0.41923698447398006,
+          0.08687886090591451,
+          -0.19810921054592492,
+          1.0821888584964323,
+          0.9696818246746695
+        ],
+        "std": [
+          0.5025011695925508,
+          0.3186337523657578,
+          0.32164503326881233,
+          0.49357459211424437,
+          0.4039177868508143,
+          0.3748570877458252,
+          0.4044295671498674
+        ],
+        "q01": [
+          -1.576135537624359,
+          -1.4606408774852753,
+          -1.154829856157303,
+          -1.0447131776809693,
+          -1.162146143913269,
+          0.031073938850313426,
+          -0.4146004369854927
+        ],
+        "q99": [
+          0.1896919891238198,
+          0.1906195104122157,
+          0.47255201905965794,
+          1.3261596608161925,
+          1.5163511252403197,
+          1.6144285202026367,
+          1.6144285202026367
+        ]
+      },
+      "left_hand": {
+        "min": [
+          -0.6000000238418579,
+          -1.2000000476837158,
+          -0.6000000238418579,
+          -1.2000000476837158,
+          0.0,
+          0.0,
+          0.0
+        ],
+        "max": [
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.699999988079071,
+          0.699999988079071
+        ],
+        "mean": [
+          -0.3543432927289546,
+          -0.7086865854579092,
+          -0.3543432927289546,
+          -0.7086865854579092,
+          0.0,
+          0.41355503927792936,
+          0.41355503927792936
+        ],
+        "std": [
+          0.29501940357613954,
+          0.5900388071522791,
+          0.29501940357613954,
+          0.5900388071522791,
+          0.0,
+          0.34418538597523085,
+          0.34418538597523085
+        ],
+        "q01": [
+          -0.6000000238418579,
+          -1.2000000476837158,
+          -0.6000000238418579,
+          -1.2000000476837158,
+          0.0,
+          0.0,
+          0.0
+        ],
+        "q99": [
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.699999988079071,
+          0.699999988079071
+        ]
+      },
+      "right_hand": {
+        "min": [
+          -0.0,
+          -0.0,
+          -0.0,
+          -0.0,
+          -0.0,
+          -0.699999988079071,
+          -0.699999988079071
+        ],
+        "max": [
+          0.6000000238418579,
+          1.2000000476837158,
+          0.6000000238418579,
+          1.2000000476837158,
+          -0.0,
+          -0.0,
+          -0.0
+        ],
+        "mean": [
+          0.3543432927289546,
+          0.7086865854579092,
+          0.3543432927289546,
+          0.7086865854579092,
+          0.0,
+          -0.41355503927792936,
+          -0.41355503927792936
+        ],
+        "std": [
+          0.29501940357613954,
+          0.5900388071522791,
+          0.29501940357613954,
+          0.5900388071522791,
+          0.0,
+          0.34418538597523085,
+          0.34418538597523085
+        ],
+        "q01": [
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          -0.699999988079071,
+          -0.699999988079071
+        ],
+        "q99": [
+          0.6000000238418579,
+          1.2000000476837158,
+          0.6000000238418579,
+          1.2000000476837158,
+          -0.0,
+          -0.0,
+          -0.0
+        ]
+      },
+      "base_height_command": {
+        "min": [
+          0.75
+        ],
+        "max": [
+          0.75
+        ],
+        "mean": [
+          0.75
+        ],
+        "std": [
+          0.0
+        ],
+        "q01": [
+          0.75
+        ],
+        "q99": [
+          0.75
+        ]
+      },
+      "navigate_command": {
+        "min": [
+          -0.20000000298023224,
+          0.0,
+          -0.6000000238418579
+        ],
+        "max": [
+          0.20000000298023224,
+          0.20000000298023224,
+          0.20000000298023224
+        ],
+        "mean": [
+          0.06668563140247832,
+          0.004235940249883477,
+          -0.11014999049586177
+        ],
+        "std": [
+          0.12825434879142483,
+          0.02879522156961949,
+          0.22963595813044332
+        ],
+        "q01": [
+          -0.20000000298023224,
+          0.0,
+          -0.6000000238418579
+        ],
+        "q99": [
+          0.20000000298023224,
+          0.20000000298023224,
+          0.20000000298023224
+        ]
+      }
+    },
+    "relative_action": {}
+  }
+}

experiment_cfg/final_model_config.json ADDED Viewed

	@@ -0,0 +1,53 @@

+{
+  "model_type": "Gr00tN1d6",
+  "model_dtype": "bfloat16",
+  "model_name": "nvidia/Eagle-Block2A-2B-v2",
+  "backbone_model_type": "eagle",
+  "model_revision": null,
+  "tune_top_llm_layers": 4,
+  "backbone_embedding_dim": 2048,
+  "tune_llm": false,
+  "tune_visual": true,
+  "select_layer": 16,
+  "reproject_vision": false,
+  "use_flash_attention": true,
+  "load_bf16": true,
+  "collator_overwrite_image_inputs": false,
+  "eagle_collator": true,
+  "backbone_trainable_params_fp32": true,
+  "apply_sincos_state_encoding": true,
+  "use_relative_action": true,
+  "max_state_dim": 128,
+  "max_action_dim": 128,
+  "action_horizon": 50,
+  "hidden_size": 1024,
+  "input_embedding_dim": 1536,
+  "add_pos_embed": true,
+  "attn_dropout": 0.2,
+  "use_vlln": true,
+  "max_seq_len": 1024,
+  "use_alternate_vl_dit": true,
+  "attend_text_every_n_blocks": 2,
+  "diffusion_model_cfg": {
+    "attention_head_dim": 48,
+    "dropout": 0.2,
+    "final_dropout": true,
+    "interleave_self_attention": true,
+    "norm_type": "ada_norm",
+    "num_attention_heads": 32,
+    "num_layers": 32,
+    "output_dim": 1024,
+    "positional_embeddings": null
+  },
+  "num_inference_timesteps": 4,
+  "noise_beta_alpha": 1.5,
+  "noise_beta_beta": 1.0,
+  "noise_s": 0.999,
+  "num_timestep_buckets": 1000,
+  "tune_projector": true,
+  "tune_diffusion_model": true,
+  "tune_vlln": true,
+  "state_dropout_prob": 0.0,
+  "state_additive_noise_scale": 0.0,
+  "max_num_embodiments": 32
+}

experiment_cfg/final_processor_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

model-00001-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cf6efad6efbd23e182905c1a5526c725e275b47202e192631042bf3a6966667c
+size 4966860224

model-00002-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e8871ac4cee155d3d4e135f62c4d4873ec7804c7c9add671f7203a151322e913
+size 4665286304

model-00003-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dcdf4f7bea965b127eadeccedd1482384b4a623819f0703c6afe4b6d360e9445
+size 1063814336

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

processor_config.json ADDED Viewed

	@@ -0,0 +1,485 @@

+{
+  "processor_class": "Gr00tN1d6Processor",
+  "processor_kwargs": {
+    "modality_configs": {
+      "behavior_r1_pro": {
+        "video": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "observation.images.rgb.head_256_256",
+            "observation.images.rgb.left_wrist_256_256",
+            "observation.images.rgb.right_wrist_256_256"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        },
+        "state": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "robot_pos",
+            "robot_ori_cos",
+            "robot_ori_sin",
+            "robot_2d_ori",
+            "robot_2d_ori_cos",
+            "robot_2d_ori_sin",
+            "robot_lin_vel",
+            "robot_ang_vel",
+            "arm_left_qpos",
+            "arm_left_qpos_sin",
+            "arm_left_qpos_cos",
+            "eef_left_pos",
+            "eef_left_quat",
+            "gripper_left_qpos",
+            "arm_right_qpos",
+            "arm_right_qpos_sin",
+            "arm_right_qpos_cos",
+            "eef_right_pos",
+            "eef_right_quat",
+            "gripper_right_qpos",
+            "trunk_qpos"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        },
+        "action": {
+          "delta_indices": [
+            0,
+            1,
+            2,
+            3,
+            4,
+            5,
+            6,
+            7,
+            8,
+            9,
+            10,
+            11,
+            12,
+            13,
+            14,
+            15,
+            16,
+            17,
+            18,
+            19,
+            20,
+            21,
+            22,
+            23,
+            24,
+            25,
+            26,
+            27,
+            28,
+            29,
+            30,
+            31
+          ],
+          "modality_keys": [
+            "base",
+            "torso",
+            "left_arm",
+            "left_gripper",
+            "right_arm",
+            "right_gripper"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": [
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "RELATIVE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": "trunk_qpos"
+            },
+            {
+              "rep": "RELATIVE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": "arm_left_qpos"
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "RELATIVE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": "arm_right_qpos"
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            }
+          ]
+        },
+        "language": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "annotation.human.coarse_action"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        }
+      },
+      "gr1": {
+        "video": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "ego_view_bg_crop_pad_res256_freq20"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        },
+        "state": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "left_arm",
+            "right_arm",
+            "left_hand",
+            "right_hand",
+            "waist"
+          ],
+          "sin_cos_embedding_keys": [
+            "left_arm",
+            "right_arm",
+            "left_hand",
+            "right_hand",
+            "waist"
+          ],
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        },
+        "action": {
+          "delta_indices": [
+            0,
+            1,
+            2,
+            3,
+            4,
+            5,
+            6,
+            7,
+            8,
+            9,
+            10,
+            11,
+            12,
+            13,
+            14,
+            15
+          ],
+          "modality_keys": [
+            "left_arm",
+            "right_arm",
+            "left_hand",
+            "right_hand",
+            "waist"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": [
+            {
+              "rep": "RELATIVE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "RELATIVE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "RELATIVE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "RELATIVE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            }
+          ]
+        },
+        "language": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "task"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        }
+      },
+      "robocasa_panda_omron": {
+        "video": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "res256_image_side_0",
+            "res256_image_side_1",
+            "res256_image_wrist_0"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        },
+        "state": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "end_effector_position_relative",
+            "end_effector_rotation_relative",
+            "gripper_qpos",
+            "base_position",
+            "base_rotation"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        },
+        "action": {
+          "delta_indices": [
+            0,
+            1,
+            2,
+            3,
+            4,
+            5,
+            6,
+            7,
+            8,
+            9,
+            10,
+            11,
+            12,
+            13,
+            14,
+            15
+          ],
+          "modality_keys": [
+            "end_effector_position",
+            "end_effector_rotation",
+            "gripper_close",
+            "base_motion",
+            "control_mode"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": [
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            }
+          ]
+        },
+        "language": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "annotation.human.action.task_description"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        }
+      },
+      "new_embodiment": {
+        "video": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "ego_view"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        },
+        "state": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "left_arm",
+            "right_arm",
+            "left_hand",
+            "right_hand",
+            "waist"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        },
+        "action": {
+          "delta_indices": [
+            0,
+            1,
+            2,
+            3,
+            4,
+            5,
+            6,
+            7,
+            8,
+            9,
+            10,
+            11,
+            12,
+            13,
+            14,
+            15
+          ],
+          "modality_keys": [
+            "left_arm",
+            "right_arm",
+            "left_hand",
+            "right_hand",
+            "base_height_command",
+            "navigate_command"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": [
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            }
+          ]
+        },
+        "language": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "annotation.human.task_description"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        }
+      }
+    },
+    "image_crop_size": null,
+    "image_target_size": null,
+    "use_albumentations": true,
+    "random_rotation_angle": null,
+    "color_jitter_params": {
+      "brightness": 0.3,
+      "contrast": 0.4,
+      "saturation": 0.5,
+      "hue": 0.08
+    },
+    "shortest_image_edge": 256,
+    "crop_fraction": 0.95,
+    "model_name": "nvidia/Eagle-Block2A-2B-v2",
+    "model_type": "eagle",
+    "formalize_language": true,
+    "max_state_dim": 128,
+    "max_action_dim": 128,
+    "max_action_horizon": 50,
+    "use_percentiles": false,
+    "clip_outliers": true,
+    "apply_sincos_state_encoding": true,
+    "use_relative_action": true
+  }
+}

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6737d5c0bb1d934959fda29ae9639c3bb8ddb9740a0b198781e7df6e43e27573
+size 14645

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:69a58e80395323ef08a4ef94d63e83fe07ffedb4206d41667d189a626475829f
+size 1465

statistics.json ADDED Viewed

The diff for this file is too large to render. See raw diff

trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:394186849149d5a727bef04d4ee20b7feaab35828bdffa3a536f4f73552a931a
+size 5777

wandb_config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"project": "finetune-gr00t-n1d6", "run_id": "ORCA-GROOT-N1.6-Sim-Pick-Place"}