Upload qwen3vl-0.6B libero-object checkpoint

Browse files

Files changed (14) hide show

.gitattributes +1 -0
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/checkpoints/step-025128-epoch-24-loss=0.0245.safetensors +3 -0
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/config.json +519 -0
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/config.yaml +432 -0
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/dataset_statistics.json +104 -0
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/gr00t_qwen3vl_0.6b_libero_object_0608_2026_06_11_02_11_30.jsonl +0 -0
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/run-metrics.jsonl +1 -0
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/tokenizer/added_tokens.json +48 -0
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/tokenizer/merges.txt +0 -0
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/tokenizer/special_tokens_map.json +31 -0
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/tokenizer/tokenizer.json +3 -0
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/tokenizer/tokenizer_config.json +401 -0
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/tokenizer/vocab.json +0 -0
gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/vlm_backbone_config.json +71 -0

.gitattributes CHANGED Viewed

@@ -43,3 +43,4 @@ openvla_dino_siglip_llama2_libero_10_full_finetune_bs64/openvla_dino_siglip_llam
 gr00t_eagle_3b_robocasa_gr1_24x30_finetune_bs64/gr00t_eagle_3b_robocasa_posttrain_24x30_official_aug_2026_05_26_16_59_09.jsonl filter=lfs diff=lfs merge=lfs -text
 gr00t_qwen3vl_0.6b_libero_10_full_finetune_bs64/gr00t_qwen3vl_0.6b_libero_10_0605_2026_06_05_10_26_17.jsonl filter=lfs diff=lfs merge=lfs -text
 gr00t_qwen3vl_0.6b_libero_10_full_finetune_bs64/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 gr00t_eagle_3b_robocasa_gr1_24x30_finetune_bs64/gr00t_eagle_3b_robocasa_posttrain_24x30_official_aug_2026_05_26_16_59_09.jsonl filter=lfs diff=lfs merge=lfs -text
 gr00t_qwen3vl_0.6b_libero_10_full_finetune_bs64/gr00t_qwen3vl_0.6b_libero_10_0605_2026_06_05_10_26_17.jsonl filter=lfs diff=lfs merge=lfs -text
 gr00t_qwen3vl_0.6b_libero_10_full_finetune_bs64/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text

gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/checkpoints/step-025128-epoch-24-loss=0.0245.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aee147a607cc396ce49aad3164ec5a1197dbf4d0ed9e32577e5e652a08a83e17
+size 8850981096

gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/config.json ADDED Viewed

	@@ -0,0 +1,519 @@

+{
+  "_qwen3vl_vla_ckpt": "/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors",
+  "_qwen3vl_vlm_config": {
+    "architectures": [
+      "Qwen3VLAForConditionalGeneration"
+    ],
+    "dtype": "bfloat16",
+    "eos_token_id": 151645,
+    "image_token_id": 151655,
+    "model_type": "qwen3_vl",
+    "pad_token_id": 151643,
+    "pos_skipping_range": 4096,
+    "text_config": {
+      "attention_bias": false,
+      "attention_dropout": 0.0,
+      "bos_token_id": 151643,
+      "dtype": "bfloat16",
+      "eos_token_id": 151645,
+      "head_dim": 128,
+      "hidden_act": "silu",
+      "hidden_size": 1024,
+      "initializer_range": 0.02,
+      "intermediate_size": 3072,
+      "max_position_embeddings": 262144,
+      "model_type": "qwen3_vl_text",
+      "num_attention_heads": 16,
+      "num_hidden_layers": 28,
+      "num_key_value_heads": 8,
+      "pad_token_id": null,
+      "rms_norm_eps": 1e-06,
+      "rope_parameters": {
+        "mrope_interleaved": true,
+        "mrope_section": [
+          24,
+          20,
+          20
+        ],
+        "rope_theta": 5000000,
+        "rope_type": "default"
+      },
+      "tie_word_embeddings": true,
+      "use_cache": true,
+      "vocab_size": 151936
+    },
+    "tie_word_embeddings": true,
+    "use_another_LLM_path": "",
+    "use_pos_skipping": false,
+    "vision_config": {
+      "deepstack_visual_indexes": [
+        5,
+        11,
+        17
+      ],
+      "depth": 24,
+      "dtype": "bfloat16",
+      "hidden_act": "gelu_pytorch_tanh",
+      "hidden_size": 1024,
+      "in_channels": 3,
+      "initializer_range": 0.02,
+      "intermediate_size": 4096,
+      "model_type": "qwen3_vl",
+      "num_heads": 16,
+      "num_position_embeddings": 2304,
+      "out_hidden_size": 1024,
+      "patch_size": 16,
+      "spatial_merge_size": 2,
+      "temporal_patch_size": 2
+    }
+  },
+  "eval": {
+    "dataset": {
+      "transforms": [
+        {
+          "embodiment_id": 2,
+          "img_keys": [
+            "agentview_image",
+            "robot0_eye_in_hand_image"
+          ],
+          "type": "ProcessLiberoEvalInputs"
+        },
+        {
+          "type": "ConvertPILImageToNumpyArray"
+        },
+        {
+          "image_mean": [
+            0.48145466,
+            0.4578275,
+            0.40821073
+          ],
+          "image_std": [
+            0.26862954,
+            0.26130258,
+            0.27577711
+          ],
+          "img_key": "pixel_values",
+          "max_pixels": 1003520,
+          "merge_size": 2,
+          "min_pixels": 3136,
+          "patch_size": 16,
+          "temporal_patch_size": 2,
+          "to_tensor": true,
+          "type": "QWen2VLImageTransform"
+        },
+        {
+          "tokenizer": {
+            "model_path": "/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/",
+            "type": "PretrainedTokenizer"
+          },
+          "type": "LiberoPromptFromInputs"
+        },
+        {
+          "gripper_key": "robot0_gripper_qpos",
+          "norm_type": "mean_std",
+          "out_key": "states",
+          "pos_key": "robot0_eef_pos",
+          "quat_key": "robot0_eef_quat",
+          "state_dim": 64,
+          "type": "LiberoProprioFromInputs"
+        }
+      ],
+      "type": "LiberoParquetEvalDataset"
+    },
+    "denormalize_action": {
+      "norm_type": "mean_std",
+      "type": "DenormalizeLiberoAction"
+    },
+    "eval_chunk_size": 10,
+    "model_family": "pi0",
+    "num_steps_wait": 10,
+    "num_trials_per_task": 50,
+    "resize_size": 224,
+    "seed": 7,
+    "task_suite_name": "libero_object",
+    "type": "LiberoEvalRunner"
+  },
+  "inference_model": {
+    "freeze_projector": false,
+    "freeze_vlm_backbone": false,
+    "name_mapping": null,
+    "pretrained_name_or_path": "/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors",
+    "type": "LlavaVLA",
+    "vla_head": {
+      "action_dim": 32,
+      "backbone_embedding_dim": 2048,
+      "diffusion_model_cfg": {
+        "attention_head_dim": 48,
+        "cross_attention_dim": 2048,
+        "dropout": 0.2,
+        "final_dropout": true,
+        "interleave_self_attention": true,
+        "norm_type": "ada_norm",
+        "num_attention_heads": 32,
+        "num_layers": 16,
+        "output_dim": 1024,
+        "positional_embeddings": null
+      },
+      "hidden_size": 1024,
+      "input_embedding_dim": 1536,
+      "num_heads": 4,
+      "num_inference_timesteps": 4,
+      "num_layers": 1,
+      "ori_action_dim": 7,
+      "state_dim": 64,
+      "traj_length": 10,
+      "type": "FlowMatchingHead",
+      "vl_self_attention_cfg": {
+        "attention_head_dim": 64,
+        "dropout": 0.2,
+        "final_dropout": true,
+        "num_attention_heads": 32,
+        "num_layers": 4,
+        "positional_embeddings": null
+      }
+    },
+    "vlm_backbone": {
+      "attn_implementation": "sdpa",
+      "projection_output_dim": 2048,
+      "projection_type": "linear",
+      "type": "Qwen3VL",
+      "use_projection": true,
+      "vlm_backbone_id": "qwen3_0.6b_vl_pt",
+      "vlm_config": {
+        "architectures": [
+          "Qwen3VLAForConditionalGeneration"
+        ],
+        "dtype": "bfloat16",
+        "eos_token_id": 151645,
+        "image_token_id": 151655,
+        "model_type": "qwen3_vl",
+        "pad_token_id": 151643,
+        "pos_skipping_range": 4096,
+        "text_config": {
+          "attention_bias": false,
+          "attention_dropout": 0.0,
+          "bos_token_id": 151643,
+          "dtype": "bfloat16",
+          "eos_token_id": 151645,
+          "head_dim": 128,
+          "hidden_act": "silu",
+          "hidden_size": 1024,
+          "initializer_range": 0.02,
+          "intermediate_size": 3072,
+          "max_position_embeddings": 262144,
+          "model_type": "qwen3_vl_text",
+          "num_attention_heads": 16,
+          "num_hidden_layers": 28,
+          "num_key_value_heads": 8,
+          "pad_token_id": null,
+          "rms_norm_eps": 1e-06,
+          "rope_parameters": {
+            "mrope_interleaved": true,
+            "mrope_section": [
+              24,
+              20,
+              20
+            ],
+            "rope_theta": 5000000,
+            "rope_type": "default"
+          },
+          "tie_word_embeddings": true,
+          "use_cache": true,
+          "vocab_size": 151936
+        },
+        "tie_word_embeddings": true,
+        "use_another_LLM_path": "",
+        "use_pos_skipping": false,
+        "vision_config": {
+          "deepstack_visual_indexes": [
+            5,
+            11,
+            17
+          ],
+          "depth": 24,
+          "dtype": "bfloat16",
+          "hidden_act": "gelu_pytorch_tanh",
+          "hidden_size": 1024,
+          "in_channels": 3,
+          "initializer_range": 0.02,
+          "intermediate_size": 4096,
+          "model_type": "qwen3_vl",
+          "num_heads": 16,
+          "num_position_embeddings": 2304,
+          "out_hidden_size": 1024,
+          "patch_size": 16,
+          "spatial_merge_size": 2,
+          "temporal_patch_size": 2
+        }
+      },
+      "vlm_path": null
+    }
+  },
+  "model": {
+    "freeze_projector": false,
+    "freeze_vlm_backbone": false,
+    "name_mapping": null,
+    "pretrained_name_or_path": "/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors",
+    "strict_mapping": false,
+    "type": "LlavaVLA",
+    "vla_head": {
+      "action_dim": 32,
+      "backbone_embedding_dim": 2048,
+      "diffusion_model_cfg": {
+        "attention_head_dim": 48,
+        "cross_attention_dim": 2048,
+        "dropout": 0.2,
+        "final_dropout": true,
+        "interleave_self_attention": true,
+        "norm_type": "ada_norm",
+        "num_attention_heads": 32,
+        "num_layers": 16,
+        "output_dim": 1024,
+        "positional_embeddings": null
+      },
+      "hidden_size": 1024,
+      "input_embedding_dim": 1536,
+      "num_heads": 4,
+      "num_inference_timesteps": 4,
+      "num_layers": 1,
+      "ori_action_dim": 7,
+      "state_dim": 64,
+      "traj_length": 10,
+      "type": "FlowMatchingHead",
+      "vl_self_attention_cfg": {
+        "attention_head_dim": 64,
+        "dropout": 0.2,
+        "final_dropout": true,
+        "num_attention_heads": 32,
+        "num_layers": 4,
+        "positional_embeddings": null
+      }
+    },
+    "vlm_backbone": {
+      "attn_implementation": "sdpa",
+      "projection_output_dim": 2048,
+      "projection_type": "linear",
+      "type": "Qwen3VL",
+      "use_projection": true,
+      "vlm_backbone_id": "qwen3_0.6b_vl_pt",
+      "vlm_config": {
+        "architectures": [
+          "Qwen3VLAForConditionalGeneration"
+        ],
+        "dtype": "bfloat16",
+        "eos_token_id": 151645,
+        "image_token_id": 151655,
+        "model_type": "qwen3_vl",
+        "pad_token_id": 151643,
+        "pos_skipping_range": 4096,
+        "text_config": {
+          "attention_bias": false,
+          "attention_dropout": 0.0,
+          "bos_token_id": 151643,
+          "dtype": "bfloat16",
+          "eos_token_id": 151645,
+          "head_dim": 128,
+          "hidden_act": "silu",
+          "hidden_size": 1024,
+          "initializer_range": 0.02,
+          "intermediate_size": 3072,
+          "max_position_embeddings": 262144,
+          "model_type": "qwen3_vl_text",
+          "num_attention_heads": 16,
+          "num_hidden_layers": 28,
+          "num_key_value_heads": 8,
+          "pad_token_id": null,
+          "rms_norm_eps": 1e-06,
+          "rope_parameters": {
+            "mrope_interleaved": true,
+            "mrope_section": [
+              24,
+              20,
+              20
+            ],
+            "rope_theta": 5000000,
+            "rope_type": "default"
+          },
+          "tie_word_embeddings": true,
+          "use_cache": true,
+          "vocab_size": 151936
+        },
+        "tie_word_embeddings": true,
+        "use_another_LLM_path": "",
+        "use_pos_skipping": false,
+        "vision_config": {
+          "deepstack_visual_indexes": [
+            5,
+            11,
+            17
+          ],
+          "depth": 24,
+          "dtype": "bfloat16",
+          "hidden_act": "gelu_pytorch_tanh",
+          "hidden_size": 1024,
+          "in_channels": 3,
+          "initializer_range": 0.02,
+          "intermediate_size": 4096,
+          "model_type": "qwen3_vl",
+          "num_heads": 16,
+          "num_position_embeddings": 2304,
+          "out_hidden_size": 1024,
+          "patch_size": 16,
+          "spatial_merge_size": 2,
+          "temporal_patch_size": 2
+        }
+      },
+      "vlm_path": null
+    }
+  },
+  "runner": {
+    "change_key_name": false,
+    "collator": {
+      "keys": [
+        "states",
+        "observation.eepose",
+        "timestamp",
+        "images",
+        "img_masks",
+        "lang_tokens",
+        "lang_masks",
+        "actions",
+        "action_masks",
+        "embodiment_ids",
+        "image_grid_thw"
+      ],
+      "meta_keys": [
+        "task_description",
+        "prompt",
+        "info",
+        "stats"
+      ],
+      "type": "DictCollator"
+    },
+    "enable_gradient_checkpointing": false,
+    "enable_mixed_precision_training": true,
+    "learning_rate": 3e-05,
+    "lr_scheduler_type": "linear-warmup+cosine-decay",
+    "max_epochs": 24,
+    "max_grad_norm": 1.0,
+    "metric": {
+      "active_trackers": [
+        "jsonl",
+        "wandb"
+      ],
+      "grad_accumulation_steps": 1,
+      "run_dir": "work_dirs",
+      "type": "VLAMetric",
+      "window_size": 1
+    },
+    "mixed_precision_dtype": "bf16",
+    "sampler": null,
+    "sharding_strategy": "full-shard",
+    "tokenizer": {
+      "model_path": "/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/",
+      "type": "PretrainedTokenizer"
+    },
+    "type": "FSDPTrainRunner",
+    "warmup_ratio": 0.03,
+    "weight_decay": 0.0
+  },
+  "train_dataloader": {
+    "dataset": {
+      "datasets": {
+        "action_key": "action",
+        "action_window_size": 10,
+        "data_root_path": [
+          "/limx/tos/limx_mani_data/raw_data/LIBERO_lerobot/libero_object_no_noops_lerobotv2.1"
+        ],
+        "statistic_name": "libero_object_no_noops",
+        "transforms": [
+          {
+            "embodiment_id": 2,
+            "name_mappings": {
+              "actions": [
+                "actions"
+              ],
+              "observation.state": [
+                "states"
+              ]
+            },
+            "parquet_keys": [
+              "observation.state",
+              "timestamp",
+              "actions",
+              "info",
+              "stats",
+              "action_masks"
+            ],
+            "type": "ProcessParquetInputs",
+            "video_keys": [
+              "observation.images.image",
+              "observation.images.wrist_image"
+            ]
+          },
+          {
+            "type": "ParquetPrompter"
+          },
+          {
+            "tokenizer": {
+              "model_path": "/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/",
+              "type": "PretrainedTokenizer"
+            },
+            "type": "ProcessPrompts"
+          },
+          {
+            "height": 224,
+            "type": "ResizeImages",
+            "width": 224
+          },
+          {
+            "image_mean": [
+              0.48145466,
+              0.4578275,
+              0.40821073
+            ],
+            "image_std": [
+              0.26862954,
+              0.26130258,
+              0.27577711
+            ],
+            "max_pixels": 1003520,
+            "merge_size": 2,
+            "min_pixels": 3136,
+            "patch_size": 16,
+            "temporal_patch_size": 2,
+            "type": "QWen2VLImageTransform"
+          },
+          {
+            "action_dim": 32,
+            "action_key": "action",
+            "norm_type": "mean_std",
+            "state_dim": 64,
+            "state_key": "proprio",
+            "type": "NormalizeStatesAndActions"
+          }
+        ],
+        "type": "ParquetDataset",
+        "use_delta": false,
+        "window_start_idx": 0
+      },
+      "name_mappings": {
+        "action": [
+          "action"
+        ],
+        "observation.state": [
+          "proprio"
+        ]
+      },
+      "statistic_keys": [
+        "observation.state",
+        "timestamp",
+        "action"
+      ],
+      "statistic_name": "libero_object_no_noops",
+      "type": "DistributedRepeatingDataset"
+    },
+    "per_device_batch_size": 8,
+    "per_device_num_workers": 4
+  }
+}

gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/config.yaml ADDED Viewed

	@@ -0,0 +1,432 @@

+_qwen3vl_vla_ckpt: /limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors
+_qwen3vl_vlm_config:
+  architectures:
+  - Qwen3VLAForConditionalGeneration
+  dtype: bfloat16
+  eos_token_id: 151645
+  image_token_id: 151655
+  model_type: qwen3_vl
+  pad_token_id: 151643
+  pos_skipping_range: 4096
+  text_config:
+    attention_bias: false
+    attention_dropout: 0.0
+    bos_token_id: 151643
+    dtype: bfloat16
+    eos_token_id: 151645
+    head_dim: 128
+    hidden_act: silu
+    hidden_size: 1024
+    initializer_range: 0.02
+    intermediate_size: 3072
+    max_position_embeddings: 262144
+    model_type: qwen3_vl_text
+    num_attention_heads: 16
+    num_hidden_layers: 28
+    num_key_value_heads: 8
+    pad_token_id: null
+    rms_norm_eps: 1.0e-06
+    rope_parameters:
+      mrope_interleaved: true
+      mrope_section:
+      - 24
+      - 20
+      - 20
+      rope_theta: 5000000
+      rope_type: default
+    tie_word_embeddings: true
+    use_cache: true
+    vocab_size: 151936
+  tie_word_embeddings: true
+  use_another_LLM_path: ''
+  use_pos_skipping: false
+  vision_config:
+    deepstack_visual_indexes:
+    - 5
+    - 11
+    - 17
+    depth: 24
+    dtype: bfloat16
+    hidden_act: gelu_pytorch_tanh
+    hidden_size: 1024
+    in_channels: 3
+    initializer_range: 0.02
+    intermediate_size: 4096
+    model_type: qwen3_vl
+    num_heads: 16
+    num_position_embeddings: 2304
+    out_hidden_size: 1024
+    patch_size: 16
+    spatial_merge_size: 2
+    temporal_patch_size: 2
+eval:
+  dataset:
+    transforms:
+    - embodiment_id: 2
+      img_keys:
+      - agentview_image
+      - robot0_eye_in_hand_image
+      type: ProcessLiberoEvalInputs
+    - type: ConvertPILImageToNumpyArray
+    - image_mean:
+      - 0.48145466
+      - 0.4578275
+      - 0.40821073
+      image_std:
+      - 0.26862954
+      - 0.26130258
+      - 0.27577711
+      img_key: pixel_values
+      max_pixels: 1003520
+      merge_size: 2
+      min_pixels: 3136
+      patch_size: 16
+      temporal_patch_size: 2
+      to_tensor: true
+      type: QWen2VLImageTransform
+    - tokenizer:
+        model_path: /limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/
+        type: PretrainedTokenizer
+      type: LiberoPromptFromInputs
+    - gripper_key: robot0_gripper_qpos
+      norm_type: mean_std
+      out_key: states
+      pos_key: robot0_eef_pos
+      quat_key: robot0_eef_quat
+      state_dim: 64
+      type: LiberoProprioFromInputs
+    type: LiberoParquetEvalDataset
+  denormalize_action:
+    norm_type: mean_std
+    type: DenormalizeLiberoAction
+  eval_chunk_size: 10
+  model_family: pi0
+  num_steps_wait: 10
+  num_trials_per_task: 50
+  resize_size: 224
+  seed: 7
+  task_suite_name: libero_object
+  type: LiberoEvalRunner
+inference_model:
+  freeze_projector: false
+  freeze_vlm_backbone: false
+  name_mapping: null
+  pretrained_name_or_path: /limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors
+  type: LlavaVLA
+  vla_head:
+    action_dim: 32
+    backbone_embedding_dim: 2048
+    diffusion_model_cfg:
+      attention_head_dim: 48
+      cross_attention_dim: 2048
+      dropout: 0.2
+      final_dropout: true
+      interleave_self_attention: true
+      norm_type: ada_norm
+      num_attention_heads: 32
+      num_layers: 16
+      output_dim: 1024
+      positional_embeddings: null
+    hidden_size: 1024
+    input_embedding_dim: 1536
+    num_heads: 4
+    num_inference_timesteps: 4
+    num_layers: 1
+    ori_action_dim: 7
+    state_dim: 64
+    traj_length: 10
+    type: FlowMatchingHead
+    vl_self_attention_cfg:
+      attention_head_dim: 64
+      dropout: 0.2
+      final_dropout: true
+      num_attention_heads: 32
+      num_layers: 4
+      positional_embeddings: null
+  vlm_backbone:
+    attn_implementation: sdpa
+    projection_output_dim: 2048
+    projection_type: linear
+    type: Qwen3VL
+    use_projection: true
+    vlm_backbone_id: qwen3_0.6b_vl_pt
+    vlm_config:
+      architectures:
+      - Qwen3VLAForConditionalGeneration
+      dtype: bfloat16
+      eos_token_id: 151645
+      image_token_id: 151655
+      model_type: qwen3_vl
+      pad_token_id: 151643
+      pos_skipping_range: 4096
+      text_config:
+        attention_bias: false
+        attention_dropout: 0.0
+        bos_token_id: 151643
+        dtype: bfloat16
+        eos_token_id: 151645
+        head_dim: 128
+        hidden_act: silu
+        hidden_size: 1024
+        initializer_range: 0.02
+        intermediate_size: 3072
+        max_position_embeddings: 262144
+        model_type: qwen3_vl_text
+        num_attention_heads: 16
+        num_hidden_layers: 28
+        num_key_value_heads: 8
+        pad_token_id: null
+        rms_norm_eps: 1.0e-06
+        rope_parameters:
+          mrope_interleaved: true
+          mrope_section:
+          - 24
+          - 20
+          - 20
+          rope_theta: 5000000
+          rope_type: default
+        tie_word_embeddings: true
+        use_cache: true
+        vocab_size: 151936
+      tie_word_embeddings: true
+      use_another_LLM_path: ''
+      use_pos_skipping: false
+      vision_config:
+        deepstack_visual_indexes:
+        - 5
+        - 11
+        - 17
+        depth: 24
+        dtype: bfloat16
+        hidden_act: gelu_pytorch_tanh
+        hidden_size: 1024
+        in_channels: 3
+        initializer_range: 0.02
+        intermediate_size: 4096
+        model_type: qwen3_vl
+        num_heads: 16
+        num_position_embeddings: 2304
+        out_hidden_size: 1024
+        patch_size: 16
+        spatial_merge_size: 2
+        temporal_patch_size: 2
+    vlm_path: null
+model:
+  freeze_projector: false
+  freeze_vlm_backbone: false
+  name_mapping: null
+  pretrained_name_or_path: /limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors
+  strict_mapping: false
+  type: LlavaVLA
+  vla_head:
+    action_dim: 32
+    backbone_embedding_dim: 2048
+    diffusion_model_cfg:
+      attention_head_dim: 48
+      cross_attention_dim: 2048
+      dropout: 0.2
+      final_dropout: true
+      interleave_self_attention: true
+      norm_type: ada_norm
+      num_attention_heads: 32
+      num_layers: 16
+      output_dim: 1024
+      positional_embeddings: null
+    hidden_size: 1024
+    input_embedding_dim: 1536
+    num_heads: 4
+    num_inference_timesteps: 4
+    num_layers: 1
+    ori_action_dim: 7
+    state_dim: 64
+    traj_length: 10
+    type: FlowMatchingHead
+    vl_self_attention_cfg:
+      attention_head_dim: 64
+      dropout: 0.2
+      final_dropout: true
+      num_attention_heads: 32
+      num_layers: 4
+      positional_embeddings: null
+  vlm_backbone:
+    attn_implementation: sdpa
+    projection_output_dim: 2048
+    projection_type: linear
+    type: Qwen3VL
+    use_projection: true
+    vlm_backbone_id: qwen3_0.6b_vl_pt
+    vlm_config:
+      architectures:
+      - Qwen3VLAForConditionalGeneration
+      dtype: bfloat16
+      eos_token_id: 151645
+      image_token_id: 151655
+      model_type: qwen3_vl
+      pad_token_id: 151643
+      pos_skipping_range: 4096
+      text_config:
+        attention_bias: false
+        attention_dropout: 0.0
+        bos_token_id: 151643
+        dtype: bfloat16
+        eos_token_id: 151645
+        head_dim: 128
+        hidden_act: silu
+        hidden_size: 1024
+        initializer_range: 0.02
+        intermediate_size: 3072
+        max_position_embeddings: 262144
+        model_type: qwen3_vl_text
+        num_attention_heads: 16
+        num_hidden_layers: 28
+        num_key_value_heads: 8
+        pad_token_id: null
+        rms_norm_eps: 1.0e-06
+        rope_parameters:
+          mrope_interleaved: true
+          mrope_section:
+          - 24
+          - 20
+          - 20
+          rope_theta: 5000000
+          rope_type: default
+        tie_word_embeddings: true
+        use_cache: true
+        vocab_size: 151936
+      tie_word_embeddings: true
+      use_another_LLM_path: ''
+      use_pos_skipping: false
+      vision_config:
+        deepstack_visual_indexes:
+        - 5
+        - 11
+        - 17
+        depth: 24
+        dtype: bfloat16
+        hidden_act: gelu_pytorch_tanh
+        hidden_size: 1024
+        in_channels: 3
+        initializer_range: 0.02
+        intermediate_size: 4096
+        model_type: qwen3_vl
+        num_heads: 16
+        num_position_embeddings: 2304
+        out_hidden_size: 1024
+        patch_size: 16
+        spatial_merge_size: 2
+        temporal_patch_size: 2
+    vlm_path: null
+runner:
+  change_key_name: false
+  collator:
+    keys:
+    - states
+    - observation.eepose
+    - timestamp
+    - images
+    - img_masks
+    - lang_tokens
+    - lang_masks
+    - actions
+    - action_masks
+    - embodiment_ids
+    - image_grid_thw
+    meta_keys:
+    - task_description
+    - prompt
+    - info
+    - stats
+    type: DictCollator
+  enable_gradient_checkpointing: false
+  enable_mixed_precision_training: true
+  learning_rate: 3.0e-05
+  lr_scheduler_type: linear-warmup+cosine-decay
+  max_epochs: 24
+  max_grad_norm: 1.0
+  metric:
+    active_trackers:
+    - jsonl
+    - wandb
+    grad_accumulation_steps: 1
+    run_dir: work_dirs
+    type: VLAMetric
+    window_size: 1
+  mixed_precision_dtype: bf16
+  sampler: null
+  sharding_strategy: full-shard
+  tokenizer:
+    model_path: /limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/
+    type: PretrainedTokenizer
+  type: FSDPTrainRunner
+  warmup_ratio: 0.03
+  weight_decay: 0.0
+train_dataloader:
+  dataset:
+    datasets:
+      action_key: action
+      action_window_size: 10
+      data_root_path:
+      - /limx/tos/limx_mani_data/raw_data/LIBERO_lerobot/libero_object_no_noops_lerobotv2.1
+      statistic_name: libero_object_no_noops
+      transforms:
+      - embodiment_id: 2
+        name_mappings:
+          actions:
+          - actions
+          observation.state:
+          - states
+        parquet_keys:
+        - observation.state
+        - timestamp
+        - actions
+        - info
+        - stats
+        - action_masks
+        type: ProcessParquetInputs
+        video_keys:
+        - observation.images.image
+        - observation.images.wrist_image
+      - type: ParquetPrompter
+      - tokenizer:
+          model_path: /limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/
+          type: PretrainedTokenizer
+        type: ProcessPrompts
+      - height: 224
+        type: ResizeImages
+        width: 224
+      - image_mean:
+        - 0.48145466
+        - 0.4578275
+        - 0.40821073
+        image_std:
+        - 0.26862954
+        - 0.26130258
+        - 0.27577711
+        max_pixels: 1003520
+        merge_size: 2
+        min_pixels: 3136
+        patch_size: 16
+        temporal_patch_size: 2
+        type: QWen2VLImageTransform
+      - action_dim: 32
+        action_key: action
+        norm_type: mean_std
+        state_dim: 64
+        state_key: proprio
+        type: NormalizeStatesAndActions
+      type: ParquetDataset
+      use_delta: false
+      window_start_idx: 0
+    name_mappings:
+      action:
+      - action
+      observation.state:
+      - proprio
+    statistic_keys:
+    - observation.state
+    - timestamp
+    - action
+    statistic_name: libero_object_no_noops
+    type: DistributedRepeatingDataset
+  per_device_batch_size: 8
+  per_device_num_workers: 4

gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/dataset_statistics.json ADDED Viewed

	@@ -0,0 +1,104 @@

+{
+  "libero_object_no_noops": {
+    "proprio": {
+      "mean": [
+        -0.029990377887890714,
+        -0.007947119348036638,
+        0.20293400450543442,
+        3.108609864126749,
+        -0.2140478258736818,
+        -0.11307033080181891,
+        0.02938040086729137,
+        -0.03055662046031239
+      ],
+      "std": [
+        0.023670072817660013,
+        0.06225550550101929,
+        0.027602195887468282,
+        0.030705662709939595,
+        0.11858388544011475,
+        0.0732862116780689,
+        0.0033820150919409114,
+        0.003251806898346789
+      ],
+      "min": [
+        -0.1765444278717041,
+        -0.29457300901412964,
+        0.008128180168569088,
+        2.2890501022338867,
+        -1.883241891860962,
+        -1.0600427389144897,
+        0.0006495157140307128,
+        -0.041782498359680176
+      ],
+      "max": [
+        0.14580604434013367,
+        0.33216384053230286,
+        0.3857804834842682,
+        3.4003844261169434,
+        0.7954911589622498,
+        0.6642207503318787,
+        0.04104341194033623,
+        -0.00018117300351150334
+      ],
+      "q01": null,
+      "q99": null
+    },
+    "timestamp": {
+      "mean": [
+        3.721695479517497
+      ],
+      "std": [
+        2.237081841546431
+      ],
+      "min": [
+        0.0
+      ],
+      "max": [
+        12.65
+      ],
+      "q01": null,
+      "q99": null
+    },
+    "action": {
+      "mean": [
+        0.07096490746267721,
+        0.13498889685796536,
+        -0.046013733641776924,
+        0.0012352044345171392,
+        0.006998803721298765,
+        -0.015027527802288103,
+        0.46428998075465666
+      ],
+      "std": [
+        0.10133946158044306,
+        0.165716399861371,
+        0.16914353294024564,
+        0.009240558533809633,
+        0.018657116474914717,
+        0.015913625946349673,
+        0.18849963395480163
+      ],
+      "min": [
+        -0.8839285969734192,
+        -0.9375,
+        -0.9375,
+        -0.15000000596046448,
+        -0.29035714268684387,
+        -0.32892856001853943,
+        0.0
+      ],
+      "max": [
+        0.9375,
+        0.8919642567634583,
+        0.9375,
+        0.17678570747375488,
+        0.35035714507102966,
+        0.1810714304447174,
+        1.0
+      ],
+      "q01": null,
+      "q99": null
+    }
+  }
+}

gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/gr00t_qwen3vl_0.6b_libero_object_0608_2026_06_11_02_11_30.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/run-metrics.jsonl ADDED Viewed

	@@ -0,0 +1 @@

+ {"hparams": "{'_qwen3vl_vla_ckpt': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors', '_qwen3vl_vlm_config': {'architectures': ['Qwen3VLAForConditionalGeneration'], 'dtype': 'bfloat16', 'eos_token_id': 151645, 'image_token_id': 151655, 'model_type': 'qwen3_vl', 'pad_token_id': 151643, 'pos_skipping_range': 4096, 'text_config': {'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 151643, 'dtype': 'bfloat16', 'eos_token_id': 151645, 'head_dim': 128, 'hidden_act': 'silu', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 3072, 'max_position_embeddings': 262144, 'model_type': 'qwen3_vl_text', 'num_attention_heads': 16, 'num_hidden_layers': 28, 'num_key_value_heads': 8, 'pad_token_id': None, 'rms_norm_eps': 1e-06, 'rope_parameters': {'mrope_interleaved': True, 'mrope_section': [24, 20, 20], 'rope_theta': 5000000, 'rope_type': 'default'}, 'tie_word_embeddings': True, 'use_cache': True, 'vocab_size': 151936}, 'tie_word_embeddings': True, 'use_another_LLM_path': '', 'use_pos_skipping': False, 'vision_config': {'deepstack_visual_indexes': [5, 11, 17], 'depth': 24, 'dtype': 'bfloat16', 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'in_channels': 3, 'initializer_range': 0.02, 'intermediate_size': 4096, 'model_type': 'qwen3_vl', 'num_heads': 16, 'num_position_embeddings': 2304, 'out_hidden_size': 1024, 'patch_size': 16, 'spatial_merge_size': 2, 'temporal_patch_size': 2}}, 'model': {'type': 'LlavaVLA', 'pretrained_name_or_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors', 'name_mapping': None, 'strict_mapping': False, 'vlm_backbone': {'type': 'Qwen3VL', 'vlm_backbone_id': 'qwen3_0.6b_vl_pt', 'vlm_path': None, 'vlm_config': {'architectures': ['Qwen3VLAForConditionalGeneration'], 'dtype': 'bfloat16', 'eos_token_id': 151645, 'image_token_id': 151655, 'model_type': 'qwen3_vl', 'pad_token_id': 151643, 'pos_skipping_range': 4096, 'text_config': {'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 151643, 'dtype': 'bfloat16', 'eos_token_id': 151645, 'head_dim': 128, 'hidden_act': 'silu', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 3072, 'max_position_embeddings': 262144, 'model_type': 'qwen3_vl_text', 'num_attention_heads': 16, 'num_hidden_layers': 28, 'num_key_value_heads': 8, 'pad_token_id': None, 'rms_norm_eps': 1e-06, 'rope_parameters': {'mrope_interleaved': True, 'mrope_section': [24, 20, 20], 'rope_theta': 5000000, 'rope_type': 'default'}, 'tie_word_embeddings': True, 'use_cache': True, 'vocab_size': 151936}, 'tie_word_embeddings': True, 'use_another_LLM_path': '', 'use_pos_skipping': False, 'vision_config': {'deepstack_visual_indexes': [5, 11, 17], 'depth': 24, 'dtype': 'bfloat16', 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'in_channels': 3, 'initializer_range': 0.02, 'intermediate_size': 4096, 'model_type': 'qwen3_vl', 'num_heads': 16, 'num_position_embeddings': 2304, 'out_hidden_size': 1024, 'patch_size': 16, 'spatial_merge_size': 2, 'temporal_patch_size': 2}}, 'use_projection': True, 'projection_output_dim': 2048, 'projection_type': 'linear', 'attn_implementation': 'sdpa'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'backbone_embedding_dim': 2048, 'vl_self_attention_cfg': {'attention_head_dim': 64, 'num_attention_heads': 32, 'num_layers': 4, 'dropout': 0.2, 'final_dropout': True, 'positional_embeddings': None}, 'diffusion_model_cfg': {'attention_head_dim': 48, 'num_attention_heads': 32, 'cross_attention_dim': 2048, 'num_layers': 16, 'output_dim': 1024, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'positional_embeddings': None}, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'freeze_projector': False}, 'inference_model': {'type': 'LlavaVLA', 'pretrained_name_or_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors', 'name_mapping': None, 'vlm_backbone': {'type': 'Qwen3VL', 'vlm_backbone_id': 'qwen3_0.6b_vl_pt', 'vlm_path': None, 'vlm_config': {'architectures': ['Qwen3VLAForConditionalGeneration'], 'dtype': 'bfloat16', 'eos_token_id': 151645, 'image_token_id': 151655, 'model_type': 'qwen3_vl', 'pad_token_id': 151643, 'pos_skipping_range': 4096, 'text_config': {'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 151643, 'dtype': 'bfloat16', 'eos_token_id': 151645, 'head_dim': 128, 'hidden_act': 'silu', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 3072, 'max_position_embeddings': 262144, 'model_type': 'qwen3_vl_text', 'num_attention_heads': 16, 'num_hidden_layers': 28, 'num_key_value_heads': 8, 'pad_token_id': None, 'rms_norm_eps': 1e-06, 'rope_parameters': {'mrope_interleaved': True, 'mrope_section': [24, 20, 20], 'rope_theta': 5000000, 'rope_type': 'default'}, 'tie_word_embeddings': True, 'use_cache': True, 'vocab_size': 151936}, 'tie_word_embeddings': True, 'use_another_LLM_path': '', 'use_pos_skipping': False, 'vision_config': {'deepstack_visual_indexes': [5, 11, 17], 'depth': 24, 'dtype': 'bfloat16', 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'in_channels': 3, 'initializer_range': 0.02, 'intermediate_size': 4096, 'model_type': 'qwen3_vl', 'num_heads': 16, 'num_position_embeddings': 2304, 'out_hidden_size': 1024, 'patch_size': 16, 'spatial_merge_size': 2, 'temporal_patch_size': 2}}, 'use_projection': True, 'projection_output_dim': 2048, 'projection_type': 'linear', 'attn_implementation': 'sdpa'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'backbone_embedding_dim': 2048, 'vl_self_attention_cfg': {'attention_head_dim': 64, 'num_attention_heads': 32, 'num_layers': 4, 'dropout': 0.2, 'final_dropout': True, 'positional_embeddings': None}, 'diffusion_model_cfg': {'attention_head_dim': 48, 'num_attention_heads': 32, 'cross_attention_dim': 2048, 'num_layers': 16, 'output_dim': 1024, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'positional_embeddings': None}, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'freeze_projector': False}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_object_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': ['/limx/tos/limx_mani_data/raw_data/LIBERO_lerobot/libero_object_no_noops_lerobotv2.1'], 'transforms': [{'type': 'ProcessParquetInputs', 'embodiment_id': 2, 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter'}, {'type': 'ProcessPrompts', 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'QWen2VLImageTransform', 'min_pixels': 3136, 'max_pixels': 1003520, 'patch_size': 16, 'temporal_patch_size': 2, 'merge_size': 2, 'image_mean': [0.48145466, 0.4578275, 0.40821073], 'image_std': [0.26862954, 0.26130258, 0.27577711]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 64, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_object_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 24, 'learning_rate': 3e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sampler': None, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/'}, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks', 'embodiment_ids', 'image_grid_thw'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'metric': {'type': 'VLAMetric', 'active_trackers': ('jsonl', 'wandb'), 'run_dir': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_object_0608_epoch24_lr3e-5', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: /limx/tos/users/Mayer/configs/gr00t_qwen3vl_0.6b_libero_object_0608.py): {'_qwen3vl_vla_ckpt': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors', '_qwen3vl_vlm_config': {'architectures': ['Qwen3VLAForConditionalGeneration'], 'dtype': 'bfloat16', 'eos_token_id': 151645, 'image_token_id': 151655, 'model_type': 'qwen3_vl', 'pad_token_id': 151643, 'pos_skipping_range': 4096, 'text_config': {'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 151643, 'dtype': 'bfloat16', 'eos_token_id': 151645, 'head_dim': 128, 'hidden_act': 'silu', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 3072, 'max_position_embeddings': 262144, 'model_type': 'qwen3_vl_text', 'num_attention_heads': 16, 'num_hidden_layers': 28, 'num_key_value_heads': 8, 'pad_token_id': None, 'rms_norm_eps': 1e-06, 'rope_parameters': {'mrope_interleaved': True, 'mrope_section': [24, 20, 20], 'rope_theta': 5000000, 'rope_type': 'default'}, 'tie_word_embeddings': True, 'use_cache': True, 'vocab_size': 151936}, 'tie_word_embeddings': True, 'use_another_LLM_path': '', 'use_pos_skipping': False, 'vision_config': {'deepstack_visual_indexes': [5, 11, 17], 'depth': 24, 'dtype': 'bfloat16', 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'in_channels': 3, 'initializer_range': 0.02, 'intermediate_size': 4096, 'model_type': 'qwen3_vl', 'num_heads': 16, 'num_position_embeddings': 2304, 'out_hidden_size': 1024, 'patch_size': 16, 'spatial_merge_size': 2, 'temporal_patch_size': 2}}, 'model': {'type': 'LlavaVLA', 'pretrained_name_or_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors', 'name_mapping': None, 'strict_mapping': False, 'vlm_backbone': {'type': 'Qwen3VL', 'vlm_backbone_id': 'qwen3_0.6b_vl_pt', 'vlm_path': None, 'vlm_config': {'architectures': ['Qwen3VLAForConditionalGeneration'], 'dtype': 'bfloat16', 'eos_token_id': 151645, 'image_token_id': 151655, 'model_type': 'qwen3_vl', 'pad_token_id': 151643, 'pos_skipping_range': 4096, 'text_config': {'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 151643, 'dtype': 'bfloat16', 'eos_token_id': 151645, 'head_dim': 128, 'hidden_act': 'silu', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 3072, 'max_position_embeddings': 262144, 'model_type': 'qwen3_vl_text', 'num_attention_heads': 16, 'num_hidden_layers': 28, 'num_key_value_heads': 8, 'pad_token_id': None, 'rms_norm_eps': 1e-06, 'rope_parameters': {'mrope_interleaved': True, 'mrope_section': [24, 20, 20], 'rope_theta': 5000000, 'rope_type': 'default'}, 'tie_word_embeddings': True, 'use_cache': True, 'vocab_size': 151936}, 'tie_word_embeddings': True, 'use_another_LLM_path': '', 'use_pos_skipping': False, 'vision_config': {'deepstack_visual_indexes': [5, 11, 17], 'depth': 24, 'dtype': 'bfloat16', 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'in_channels': 3, 'initializer_range': 0.02, 'intermediate_size': 4096, 'model_type': 'qwen3_vl', 'num_heads': 16, 'num_position_embeddings': 2304, 'out_hidden_size': 1024, 'patch_size': 16, 'spatial_merge_size': 2, 'temporal_patch_size': 2}}, 'use_projection': True, 'projection_output_dim': 2048, 'projection_type': 'linear', 'attn_implementation': 'sdpa'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'backbone_embedding_dim': 2048, 'vl_self_attention_cfg': {'attention_head_dim': 64, 'num_attention_heads': 32, 'num_layers': 4, 'dropout': 0.2, 'final_dropout': True, 'positional_embeddings': None}, 'diffusion_model_cfg': {'attention_head_dim': 48, 'num_attention_heads': 32, 'cross_attention_dim': 2048, 'num_layers': 16, 'output_dim': 1024, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'positional_embeddings': None}, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'freeze_projector': False}, 'inference_model': {'type': 'LlavaVLA', 'pretrained_name_or_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors', 'name_mapping': None, 'vlm_backbone': {'type': 'Qwen3VL', 'vlm_backbone_id': 'qwen3_0.6b_vl_pt', 'vlm_path': None, 'vlm_config': {'architectures': ['Qwen3VLAForConditionalGeneration'], 'dtype': 'bfloat16', 'eos_token_id': 151645, 'image_token_id': 151655, 'model_type': 'qwen3_vl', 'pad_token_id': 151643, 'pos_skipping_range': 4096, 'text_config': {'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 151643, 'dtype': 'bfloat16', 'eos_token_id': 151645, 'head_dim': 128, 'hidden_act': 'silu', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 3072, 'max_position_embeddings': 262144, 'model_type': 'qwen3_vl_text', 'num_attention_heads': 16, 'num_hidden_layers': 28, 'num_key_value_heads': 8, 'pad_token_id': None, 'rms_norm_eps': 1e-06, 'rope_parameters': {'mrope_interleaved': True, 'mrope_section': [24, 20, 20], 'rope_theta': 5000000, 'rope_type': 'default'}, 'tie_word_embeddings': True, 'use_cache': True, 'vocab_size': 151936}, 'tie_word_embeddings': True, 'use_another_LLM_path': '', 'use_pos_skipping': False, 'vision_config': {'deepstack_visual_indexes': [5, 11, 17], 'depth': 24, 'dtype': 'bfloat16', 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'in_channels': 3, 'initializer_range': 0.02, 'intermediate_size': 4096, 'model_type': 'qwen3_vl', 'num_heads': 16, 'num_position_embeddings': 2304, 'out_hidden_size': 1024, 'patch_size': 16, 'spatial_merge_size': 2, 'temporal_patch_size': 2}}, 'use_projection': True, 'projection_output_dim': 2048, 'projection_type': 'linear', 'attn_implementation': 'sdpa'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'backbone_embedding_dim': 2048, 'vl_self_attention_cfg': {'attention_head_dim': 64, 'num_attention_heads': 32, 'num_layers': 4, 'dropout': 0.2, 'final_dropout': True, 'positional_embeddings': None}, 'diffusion_model_cfg': {'attention_head_dim': 48, 'num_attention_heads': 32, 'cross_attention_dim': 2048, 'num_layers': 16, 'output_dim': 1024, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'positional_embeddings': None}, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'freeze_projector': False}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_object_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': ['/limx/tos/limx_mani_data/raw_data/LIBERO_lerobot/libero_object_no_noops_lerobotv2.1'], 'transforms': [{'type': 'ProcessParquetInputs', 'embodiment_id': 2, 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter'}, {'type': 'ProcessPrompts', 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'QWen2VLImageTransform', 'min_pixels': 3136, 'max_pixels': 1003520, 'patch_size': 16, 'temporal_patch_size': 2, 'merge_size': 2, 'image_mean': [0.48145466, 0.4578275, 0.40821073], 'image_std': [0.26862954, 0.26130258, 0.27577711]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 64, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_object_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 24, 'learning_rate': 3e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sampler': None, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/'}, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks', 'embodiment_ids', 'image_grid_thw'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'metric': {'type': 'VLAMetric', 'active_trackers': ('jsonl', 'wandb'), 'run_dir': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_object_0608_epoch24_lr3e-5', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: /limx/tos/users/Mayer/configs/gr00t_qwen3vl_0.6b_libero_object_0608.py): {...}, 'run_id': 'gr00t_qwen3vl_0.6b_libero_object_0608_2026_06_11_02_11_30'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'sharding_strategy': 'full-shard', 'enable_gradient_checkpointing': False, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'cfg': Config (path: /limx/tos/users/Mayer/configs/gr00t_qwen3vl_0.6b_libero_object_0608.py): {...}, 'args': Namespace(config='/limx/tos/users/Mayer/configs/gr00t_qwen3vl_0.6b_libero_object_0608.py', work_dir='/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_object_0608_epoch24_lr3e-5', cfg_options={'runner.learning_rate': 3e-05, 'runner.max_epochs': 24}, eval_after_train=False, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_object', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'embodiment_id': 2, 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'ConvertPILImageToNumpyArray'}, {'type': 'QWen2VLImageTransform', 'min_pixels': 3136, 'max_pixels': 1003520, 'patch_size': 16, 'temporal_patch_size': 2, 'merge_size': 2, 'image_mean': [0.48145466, 0.4578275, 0.40821073], 'image_std': [0.26862954, 0.26130258, 0.27577711], 'img_key': 'pixel_values', 'to_tensor': True}, {'type': 'LiberoPromptFromInputs', 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/'}}, {'type': 'LiberoProprioFromInputs', 'state_dim': 64, 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std'}}}, 'run_id': 'gr00t_qwen3vl_0.6b_libero_object_0608_2026_06_11_02_11_30'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'sharding_strategy': 'full-shard', 'enable_gradient_checkpointing': False, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'cfg': Config (path: /limx/tos/users/Mayer/configs/gr00t_qwen3vl_0.6b_libero_object_0608.py): {'_qwen3vl_vla_ckpt': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors', '_qwen3vl_vlm_config': {'architectures': ['Qwen3VLAForConditionalGeneration'], 'dtype': 'bfloat16', 'eos_token_id': 151645, 'image_token_id': 151655, 'model_type': 'qwen3_vl', 'pad_token_id': 151643, 'pos_skipping_range': 4096, 'text_config': {'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 151643, 'dtype': 'bfloat16', 'eos_token_id': 151645, 'head_dim': 128, 'hidden_act': 'silu', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 3072, 'max_position_embeddings': 262144, 'model_type': 'qwen3_vl_text', 'num_attention_heads': 16, 'num_hidden_layers': 28, 'num_key_value_heads': 8, 'pad_token_id': None, 'rms_norm_eps': 1e-06, 'rope_parameters': {'mrope_interleaved': True, 'mrope_section': [24, 20, 20], 'rope_theta': 5000000, 'rope_type': 'default'}, 'tie_word_embeddings': True, 'use_cache': True, 'vocab_size': 151936}, 'tie_word_embeddings': True, 'use_another_LLM_path': '', 'use_pos_skipping': False, 'vision_config': {'deepstack_visual_indexes': [5, 11, 17], 'depth': 24, 'dtype': 'bfloat16', 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'in_channels': 3, 'initializer_range': 0.02, 'intermediate_size': 4096, 'model_type': 'qwen3_vl', 'num_heads': 16, 'num_position_embeddings': 2304, 'out_hidden_size': 1024, 'patch_size': 16, 'spatial_merge_size': 2, 'temporal_patch_size': 2}}, 'model': {'type': 'LlavaVLA', 'pretrained_name_or_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors', 'name_mapping': None, 'strict_mapping': False, 'vlm_backbone': {'type': 'Qwen3VL', 'vlm_backbone_id': 'qwen3_0.6b_vl_pt', 'vlm_path': None, 'vlm_config': {'architectures': ['Qwen3VLAForConditionalGeneration'], 'dtype': 'bfloat16', 'eos_token_id': 151645, 'image_token_id': 151655, 'model_type': 'qwen3_vl', 'pad_token_id': 151643, 'pos_skipping_range': 4096, 'text_config': {'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 151643, 'dtype': 'bfloat16', 'eos_token_id': 151645, 'head_dim': 128, 'hidden_act': 'silu', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 3072, 'max_position_embeddings': 262144, 'model_type': 'qwen3_vl_text', 'num_attention_heads': 16, 'num_hidden_layers': 28, 'num_key_value_heads': 8, 'pad_token_id': None, 'rms_norm_eps': 1e-06, 'rope_parameters': {'mrope_interleaved': True, 'mrope_section': [24, 20, 20], 'rope_theta': 5000000, 'rope_type': 'default'}, 'tie_word_embeddings': True, 'use_cache': True, 'vocab_size': 151936}, 'tie_word_embeddings': True, 'use_another_LLM_path': '', 'use_pos_skipping': False, 'vision_config': {'deepstack_visual_indexes': [5, 11, 17], 'depth': 24, 'dtype': 'bfloat16', 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'in_channels': 3, 'initializer_range': 0.02, 'intermediate_size': 4096, 'model_type': 'qwen3_vl', 'num_heads': 16, 'num_position_embeddings': 2304, 'out_hidden_size': 1024, 'patch_size': 16, 'spatial_merge_size': 2, 'temporal_patch_size': 2}}, 'use_projection': True, 'projection_output_dim': 2048, 'projection_type': 'linear', 'attn_implementation': 'sdpa'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'backbone_embedding_dim': 2048, 'vl_self_attention_cfg': {'attention_head_dim': 64, 'num_attention_heads': 32, 'num_layers': 4, 'dropout': 0.2, 'final_dropout': True, 'positional_embeddings': None}, 'diffusion_model_cfg': {'attention_head_dim': 48, 'num_attention_heads': 32, 'cross_attention_dim': 2048, 'num_layers': 16, 'output_dim': 1024, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'positional_embeddings': None}, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'freeze_projector': False}, 'inference_model': {'type': 'LlavaVLA', 'pretrained_name_or_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/checkpoints/step-104160-epoch-24-loss=0.0358.safetensors', 'name_mapping': None, 'vlm_backbone': {'type': 'Qwen3VL', 'vlm_backbone_id': 'qwen3_0.6b_vl_pt', 'vlm_path': None, 'vlm_config': {'architectures': ['Qwen3VLAForConditionalGeneration'], 'dtype': 'bfloat16', 'eos_token_id': 151645, 'image_token_id': 151655, 'model_type': 'qwen3_vl', 'pad_token_id': 151643, 'pos_skipping_range': 4096, 'text_config': {'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 151643, 'dtype': 'bfloat16', 'eos_token_id': 151645, 'head_dim': 128, 'hidden_act': 'silu', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 3072, 'max_position_embeddings': 262144, 'model_type': 'qwen3_vl_text', 'num_attention_heads': 16, 'num_hidden_layers': 28, 'num_key_value_heads': 8, 'pad_token_id': None, 'rms_norm_eps': 1e-06, 'rope_parameters': {'mrope_interleaved': True, 'mrope_section': [24, 20, 20], 'rope_theta': 5000000, 'rope_type': 'default'}, 'tie_word_embeddings': True, 'use_cache': True, 'vocab_size': 151936}, 'tie_word_embeddings': True, 'use_another_LLM_path': '', 'use_pos_skipping': False, 'vision_config': {'deepstack_visual_indexes': [5, 11, 17], 'depth': 24, 'dtype': 'bfloat16', 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'in_channels': 3, 'initializer_range': 0.02, 'intermediate_size': 4096, 'model_type': 'qwen3_vl', 'num_heads': 16, 'num_position_embeddings': 2304, 'out_hidden_size': 1024, 'patch_size': 16, 'spatial_merge_size': 2, 'temporal_patch_size': 2}}, 'use_projection': True, 'projection_output_dim': 2048, 'projection_type': 'linear', 'attn_implementation': 'sdpa'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'backbone_embedding_dim': 2048, 'vl_self_attention_cfg': {'attention_head_dim': 64, 'num_attention_heads': 32, 'num_layers': 4, 'dropout': 0.2, 'final_dropout': True, 'positional_embeddings': None}, 'diffusion_model_cfg': {'attention_head_dim': 48, 'num_attention_heads': 32, 'cross_attention_dim': 2048, 'num_layers': 16, 'output_dim': 1024, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'positional_embeddings': None}, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'freeze_projector': False}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_object_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': ['/limx/tos/limx_mani_data/raw_data/LIBERO_lerobot/libero_object_no_noops_lerobotv2.1'], 'transforms': [{'type': 'ProcessParquetInputs', 'embodiment_id': 2, 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter'}, {'type': 'ProcessPrompts', 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'QWen2VLImageTransform', 'min_pixels': 3136, 'max_pixels': 1003520, 'patch_size': 16, 'temporal_patch_size': 2, 'merge_size': 2, 'image_mean': [0.48145466, 0.4578275, 0.40821073], 'image_std': [0.26862954, 0.26130258, 0.27577711]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 64, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_object_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 24, 'learning_rate': 3e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sampler': None, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/'}, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks', 'embodiment_ids', 'image_grid_thw'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'metric': {'type': 'VLAMetric', 'active_trackers': ('jsonl', 'wandb'), 'run_dir': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_object_0608_epoch24_lr3e-5', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: /limx/tos/users/Mayer/configs/gr00t_qwen3vl_0.6b_libero_object_0608.py): {...}, 'run_id': 'gr00t_qwen3vl_0.6b_libero_object_0608_2026_06_11_02_11_30'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'sharding_strategy': 'full-shard', 'enable_gradient_checkpointing': False, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'cfg': Config (path: /limx/tos/users/Mayer/configs/gr00t_qwen3vl_0.6b_libero_object_0608.py): {...}, 'args': Namespace(config='/limx/tos/users/Mayer/configs/gr00t_qwen3vl_0.6b_libero_object_0608.py', work_dir='/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_object_0608_epoch24_lr3e-5', cfg_options={'runner.learning_rate': 3e-05, 'runner.max_epochs': 24}, eval_after_train=False, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_object', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'embodiment_id': 2, 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'ConvertPILImageToNumpyArray'}, {'type': 'QWen2VLImageTransform', 'min_pixels': 3136, 'max_pixels': 1003520, 'patch_size': 16, 'temporal_patch_size': 2, 'merge_size': 2, 'image_mean': [0.48145466, 0.4578275, 0.40821073], 'image_std': [0.26862954, 0.26130258, 0.27577711], 'img_key': 'pixel_values', 'to_tensor': True}, {'type': 'LiberoPromptFromInputs', 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/'}}, {'type': 'LiberoProprioFromInputs', 'state_dim': 64, 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std'}}}, 'args': Namespace(config='/limx/tos/users/Mayer/configs/gr00t_qwen3vl_0.6b_libero_object_0608.py', work_dir='/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_object_0608_epoch24_lr3e-5', cfg_options={'runner.learning_rate': 3e-05, 'runner.max_epochs': 24}, eval_after_train=False, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_object', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'embodiment_id': 2, 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'ConvertPILImageToNumpyArray'}, {'type': 'QWen2VLImageTransform', 'min_pixels': 3136, 'max_pixels': 1003520, 'patch_size': 16, 'temporal_patch_size': 2, 'merge_size': 2, 'image_mean': [0.48145466, 0.4578275, 0.40821073], 'image_std': [0.26862954, 0.26130258, 0.27577711], 'img_key': 'pixel_values', 'to_tensor': True}, {'type': 'LiberoPromptFromInputs', 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': '/limx/tos/limx_mani_checkpoints/checkpoints/mayer_GR00T/gr00t_qwen3vl_0.6b_libero_all_0604_epoch24/tokenizer/'}}, {'type': 'LiberoProprioFromInputs', 'state_dim': 64, 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std'}}}", "run_id": "gr00t_qwen3vl_0.6b_libero_object_0608_2026_06_11_02_11_30"}

gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/tokenizer/added_tokens.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<action_end>": 151675,
+  "<action_pad>": 151673,
+  "<action_start>": 151674,
+  "<action_video>": 151669,
+  "<box2d_end>": 151679,
+  "<box2d_start>": 151678,
+  "<future_video_pad>": 151670,
+  "<future_vision_end>": 151672,
+  "<future_vision_start>": 151671,
+  "<ignore_pad>": 151688,
+  "<point2d_end>": 151681,
+  "<point2d_start>": 151680,
+  "<ref_end>": 151683,
+  "<ref_keypoint_end>": 151685,
+  "<ref_keypoint_start>": 151684,
+  "<ref_start>": 151682,
+  "<think>": 151667,
+  "<think_end>": 151677,
+  "<think_start>": 151676,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<traj2d_end>": 151687,
+  "<traj2d_start>": 151686,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/tokenizer/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/tokenizer/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/tokenizer/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3b7a88bc82340dd4205b97b9c287df826cf386b31ac9ecd9e648073d940355e1
+size 11426476

gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,401 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151669": {
+      "content": "<action_video>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "<future_video_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "<future_vision_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151672": {
+      "content": "<future_vision_end>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151673": {
+      "content": "<action_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151674": {
+      "content": "<action_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151675": {
+      "content": "<action_end>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151676": {
+      "content": "<think_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151677": {
+      "content": "<think_end>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151678": {
+      "content": "<box2d_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151679": {
+      "content": "<box2d_end>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151680": {
+      "content": "<point2d_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151681": {
+      "content": "<point2d_end>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151682": {
+      "content": "<ref_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151683": {
+      "content": "<ref_end>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151684": {
+      "content": "<ref_keypoint_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151685": {
+      "content": "<ref_keypoint_end>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151686": {
+      "content": "<traj2d_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151687": {
+      "content": "<traj2d_end>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151688": {
+      "content": "<ignore_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "processor_class": "Qwen3VLAProcessor",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/tokenizer/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

gr00t_qwen3vl_0.6b_libero_object_full_finetune_bs64/vlm_backbone_config.json ADDED Viewed

	@@ -0,0 +1,71 @@

+{
+  "architectures": [
+    "Qwen3VLAForConditionalGeneration"
+  ],
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "image_token_id": 151655,
+  "model_type": "qwen3_vl",
+  "pad_token_id": 151643,
+  "pos_skipping_range": 4096,
+  "text_config": {
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "bos_token_id": 151643,
+    "dtype": "bfloat16",
+    "eos_token_id": 151645,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 1024,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "max_position_embeddings": 262144,
+    "model_type": "qwen3_vl_text",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 28,
+    "num_key_value_heads": 8,
+    "pad_token_id": null,
+    "rms_norm_eps": 1e-06,
+    "rope_parameters": {
+      "mrope_interleaved": true,
+      "mrope_section": [
+        24,
+        20,
+        20
+      ],
+      "rope_theta": 5000000,
+      "rope_type": "default"
+    },
+    "tie_word_embeddings": true,
+    "use_cache": true,
+    "vocab_size": 151936
+  },
+  "tie_word_embeddings": true,
+  "transformers_version": "5.3.0",
+  "use_another_LLM_path": "",
+  "use_pos_skipping": false,
+  "video_token_id": 151656,
+  "vision_config": {
+    "deepstack_visual_indexes": [
+      5,
+      11,
+      17
+    ],
+    "depth": 24,
+    "dtype": "bfloat16",
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1024,
+    "in_channels": 3,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "model_type": "qwen3_vl",
+    "num_heads": 16,
+    "num_position_embeddings": 2304,
+    "out_hidden_size": 1024,
+    "patch_size": 16,
+    "spatial_merge_size": 2,
+    "temporal_patch_size": 2
+  },
+  "vision_end_token_id": 151653,
+  "vision_start_token_id": 151652
+}