Boqian-Li committed on
Commit
cae131c
·
verified ·
1 Parent(s): f6cf5f7

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +6 -0
  2. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/config.json +113 -0
  3. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/experiment_cfg/conf.yaml +3318 -0
  4. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/experiment_cfg/metadata.json +787 -0
  5. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  6. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
  7. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
  8. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
  9. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
  10. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
  11. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
  12. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
  13. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/global_step20000/mp_rank_00_model_states.pt +3 -0
  14. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/latest +1 -0
  15. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/model-00001-of-00010.safetensors +3 -0
  16. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/model-00002-of-00010.safetensors +3 -0
  17. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/model-00003-of-00010.safetensors +3 -0
  18. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/model-00004-of-00010.safetensors +3 -0
  19. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/model-00005-of-00010.safetensors +3 -0
  20. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/model-00006-of-00010.safetensors +3 -0
  21. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/model-00007-of-00010.safetensors +3 -0
  22. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/model-00008-of-00010.safetensors +3 -0
  23. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/model-00009-of-00010.safetensors +3 -0
  24. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/model-00010-of-00010.safetensors +3 -0
  25. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/model.safetensors.index.json +0 -0
  26. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/real_world_eval_gen_20260324_0/000000_03_24_13_51_02_n1.mp4 +0 -0
  27. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/real_world_eval_gen_20260324_0/000001_03_24_13_58_27_n34.mp4 +3 -0
  28. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/real_world_eval_gen_20260326_0/000000_03_26_17_26_53_n40.mp4 +3 -0
  29. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/real_world_eval_gen_20260326_0/000001_03_26_17_31_03_n40.mp4 +3 -0
  30. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/real_world_eval_gen_20260327_0/000000_03_27_02_53_54_n50.mp4 +3 -0
  31. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/real_world_eval_gen_20260327_0/000001_03_27_02_55_16_n40.mp4 +3 -0
  32. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/rng_state_0.pth +3 -0
  33. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/rng_state_1.pth +3 -0
  34. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/rng_state_2.pth +3 -0
  35. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/rng_state_3.pth +3 -0
  36. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/rng_state_4.pth +3 -0
  37. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/rng_state_5.pth +3 -0
  38. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/rng_state_6.pth +3 -0
  39. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/rng_state_7.pth +3 -0
  40. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/scheduler.pt +3 -0
  41. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/trainer_state.json +0 -0
  42. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/wandb_config.json +1 -0
  43. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/zero_to_fp32.py +760 -0
  44. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/experiment_cfg/conf.yaml +3318 -0
  45. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/experiment_cfg/metadata.json +787 -0
  46. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/loss_log.jsonl +0 -0
  47. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/wandb/debug-internal.log +20 -0
  48. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/wandb/debug.log +24 -0
  49. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/wandb/run-20260319_110703-lsme06f2/files/config.yaml +651 -0
  50. dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/wandb/run-20260319_110703-lsme06f2/files/output.log +0 -0
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/real_world_eval_gen_20260324_0/000001_03_24_13_58_27_n34.mp4 filter=lfs diff=lfs merge=lfs -text
37
+ dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/real_world_eval_gen_20260326_0/000000_03_26_17_26_53_n40.mp4 filter=lfs diff=lfs merge=lfs -text
38
+ dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/real_world_eval_gen_20260326_0/000001_03_26_17_31_03_n40.mp4 filter=lfs diff=lfs merge=lfs -text
39
+ dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/real_world_eval_gen_20260327_0/000000_03_27_02_53_54_n50.mp4 filter=lfs diff=lfs merge=lfs -text
40
+ dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/real_world_eval_gen_20260327_0/000001_03_27_02_55_16_n40.mp4 filter=lfs diff=lfs merge=lfs -text
41
+ dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/wandb/run-20260319_110703-lsme06f2/run-lsme06f2.wandb filter=lfs diff=lfs merge=lfs -text
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/config.json ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_dim": 36,
3
+ "action_head_cfg": {
4
+ "_convert_": "object",
5
+ "_target_": "groot.vla.model.dreamzero.action_head.wan_flow_matching_action_tf.WANPolicyHead",
6
+ "config": {
7
+ "_recursive_": false,
8
+ "_target_": "groot.vla.model.dreamzero.action_head.wan_flow_matching_action_tf.WANPolicyHeadConfig",
9
+ "action_dim": 36,
10
+ "action_horizon": 24,
11
+ "action_loss_embodiment_ids": [
12
+ 26,
13
+ 17,
14
+ 32
15
+ ],
16
+ "add_pos_embed": true,
17
+ "backbone_embedding_dim": 0,
18
+ "backbone_features_projector_cfg": null,
19
+ "decouple_video_action_noise": false,
20
+ "diffusion_model_cfg": {
21
+ "_convert_": "object",
22
+ "_target_": "groot.vla.model.dreamzero.modules.wan_video_dit_action_casual_chunk.CausalWanModel",
23
+ "action_dim": 36,
24
+ "diffusion_model_pretrained_path": "/hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P",
25
+ "dim": 5120,
26
+ "eps": 1e-06,
27
+ "ffn_dim": 13824,
28
+ "frame_seqlen": 220,
29
+ "freq_dim": 256,
30
+ "in_dim": 36,
31
+ "max_chunk_size": 4,
32
+ "model_type": "i2v",
33
+ "num_action_per_block": 24,
34
+ "num_frame_per_block": 2,
35
+ "num_heads": 40,
36
+ "num_layers": 40,
37
+ "num_state_per_block": 1,
38
+ "out_dim": 16
39
+ },
40
+ "expand_batch": null,
41
+ "freeze_decode_layer": false,
42
+ "hidden_size": 64,
43
+ "image_encoder_cfg": {
44
+ "_convert_": "object",
45
+ "_target_": "groot.vla.model.dreamzero.modules.wan_video_image_encoder.WanImageEncoder",
46
+ "image_encoder_pretrained_path": "/hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"
47
+ },
48
+ "init_lora_weights": "kaiming",
49
+ "input_embedding_dim": 1536,
50
+ "load_pretrained_det_decode_layer_path": null,
51
+ "lora_alpha": 4,
52
+ "lora_rank": 4,
53
+ "lora_target_modules": "q,k,v,o,ffn.0,ffn.2",
54
+ "max_action_dim": 36,
55
+ "max_state_dim": 64,
56
+ "model_dtype": "float32",
57
+ "noise_beta_alpha": 1.5,
58
+ "noise_beta_beta": 1.0,
59
+ "noise_s": 0.999,
60
+ "num_frame_per_block": 2,
61
+ "num_frames": 33,
62
+ "num_inference_timesteps": 4,
63
+ "num_timestep_buckets": 1000,
64
+ "repa_coeff": 1.0,
65
+ "repa_layer": 8,
66
+ "skip_component_loading": true,
67
+ "text_encoder_cfg": {
68
+ "_convert_": "object",
69
+ "_target_": "groot.vla.model.dreamzero.modules.wan_video_text_encoder.WanTextEncoder",
70
+ "text_encoder_pretrained_path": "/hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/models_t5_umt5-xxl-enc-bf16.pth"
71
+ },
72
+ "tile_size_height": 34,
73
+ "tile_size_width": 34,
74
+ "tile_stride_height": 18,
75
+ "tile_stride_width": 16,
76
+ "tiled": false,
77
+ "train_architecture": "full",
78
+ "tune_diffusion_model": true,
79
+ "tune_projector": true,
80
+ "use_gradient_checkpointing": true,
81
+ "use_vlln": true,
82
+ "vae_cfg": {
83
+ "_convert_": "object",
84
+ "_target_": "groot.vla.model.dreamzero.modules.wan_video_vae.WanVideoVAE",
85
+ "vae_pretrained_path": "/hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/Wan2.1_VAE.pth"
86
+ },
87
+ "video_noise_beta_alpha": 3.0,
88
+ "video_noise_beta_beta": 1.0,
89
+ "vl_self_attention_cfg": {
90
+ "_target_": "groot.vla.model.n1_5.modules.cross_attention_dit.SelfAttentionTransformer",
91
+ "attention_head_dim": 64,
92
+ "dropout": 0.2,
93
+ "final_dropout": true,
94
+ "num_attention_heads": 24,
95
+ "num_layers": 4,
96
+ "positional_embeddings": null
97
+ }
98
+ }
99
+ },
100
+ "action_horizon": 24,
101
+ "architectures": [
102
+ "VLA"
103
+ ],
104
+ "backbone_cfg": {
105
+ "_target_": "groot.vla.model.dreamzero.backbone.identity.IdentityBackbone"
106
+ },
107
+ "hidden_size": 0,
108
+ "model_dtype": "float32",
109
+ "model_type": "vla",
110
+ "resume_path": "./checkpoints/dreamzero_real_teleop_g1_full_finetune",
111
+ "torch_dtype": "bfloat16",
112
+ "transformers_version": "4.51.3"
113
+ }
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/experiment_cfg/conf.yaml ADDED
@@ -0,0 +1,3318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ _target_: groot.vla.model.dreamzero.base_vla.VLA
3
+ _convert_: object
4
+ config:
5
+ _target_: groot.vla.model.dreamzero.base_vla.VLAConfig
6
+ _recursive_: false
7
+ model_dtype: float32
8
+ hidden_size: 0
9
+ action_horizon: 24
10
+ action_dim: 36
11
+ backbone_cfg:
12
+ _target_: groot.vla.model.dreamzero.backbone.identity.IdentityBackbone
13
+ action_head_cfg:
14
+ config:
15
+ backbone_features_projector_cfg: null
16
+ _target_: groot.vla.model.dreamzero.action_head.wan_flow_matching_action_tf.WANPolicyHeadConfig
17
+ _recursive_: false
18
+ tiled: false
19
+ tile_size_height: 34
20
+ tile_size_width: 34
21
+ tile_stride_height: 18
22
+ tile_stride_width: 16
23
+ lora_rank: 4
24
+ lora_alpha: 4
25
+ num_frames: 33
26
+ num_frame_per_block: 2
27
+ lora_target_modules: q,k,v,o,ffn.0,ffn.2
28
+ init_lora_weights: kaiming
29
+ train_architecture: full
30
+ use_gradient_checkpointing: true
31
+ add_pos_embed: true
32
+ model_dtype: float32
33
+ max_state_dim: 64
34
+ max_action_dim: 36
35
+ action_loss_embodiment_ids:
36
+ - 26
37
+ - 17
38
+ - 32
39
+ hidden_size: 64
40
+ input_embedding_dim: 1536
41
+ backbone_embedding_dim: 0
42
+ repa_layer: 8
43
+ repa_coeff: 1.0
44
+ load_pretrained_det_decode_layer_path: null
45
+ freeze_decode_layer: false
46
+ expand_batch: null
47
+ use_vlln: true
48
+ vl_self_attention_cfg:
49
+ _target_: groot.vla.model.n1_5.modules.cross_attention_dit.SelfAttentionTransformer
50
+ positional_embeddings: null
51
+ num_layers: 4
52
+ num_attention_heads: 24
53
+ attention_head_dim: 64
54
+ dropout: 0.2
55
+ final_dropout: true
56
+ diffusion_model_cfg:
57
+ _target_: groot.vla.model.dreamzero.modules.wan_video_dit_action_casual_chunk.CausalWanModel
58
+ _convert_: object
59
+ diffusion_model_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P
60
+ model_type: i2v
61
+ frame_seqlen: 220
62
+ dim: 5120
63
+ in_dim: 36
64
+ ffn_dim: 13824
65
+ out_dim: 16
66
+ freq_dim: 256
67
+ eps: 1.0e-06
68
+ num_heads: 40
69
+ num_layers: 40
70
+ max_chunk_size: 4
71
+ num_frame_per_block: 2
72
+ num_action_per_block: 24
73
+ num_state_per_block: 1
74
+ action_dim: 36
75
+ text_encoder_cfg:
76
+ _target_: groot.vla.model.dreamzero.modules.wan_video_text_encoder.WanTextEncoder
77
+ _convert_: object
78
+ text_encoder_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/models_t5_umt5-xxl-enc-bf16.pth
79
+ image_encoder_cfg:
80
+ _target_: groot.vla.model.dreamzero.modules.wan_video_image_encoder.WanImageEncoder
81
+ _convert_: object
82
+ image_encoder_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth
83
+ vae_cfg:
84
+ _target_: groot.vla.model.dreamzero.modules.wan_video_vae.WanVideoVAE
85
+ _convert_: object
86
+ vae_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/Wan2.1_VAE.pth
87
+ action_dim: 36
88
+ action_horizon: 24
89
+ num_inference_timesteps: 4
90
+ noise_beta_alpha: 1.5
91
+ noise_beta_beta: 1.0
92
+ noise_s: 0.999
93
+ num_timestep_buckets: 1000
94
+ decouple_video_action_noise: false
95
+ video_noise_beta_alpha: 3.0
96
+ video_noise_beta_beta: 1.0
97
+ tune_projector: true
98
+ tune_diffusion_model: true
99
+ skip_component_loading: true
100
+ _target_: groot.vla.model.dreamzero.action_head.wan_flow_matching_action_tf.WANPolicyHead
101
+ _convert_: object
102
+ train_dataset:
103
+ _target_: groot.vla.data.dataset.lerobot_sharded.ShardedLeRobotMixtureDataset.from_mixture_spec
104
+ _convert_: object
105
+ mixture_spec:
106
+ - dataset_path:
107
+ real_teleop_g1:
108
+ - /hfm/boqian/liboqian_data/data/real_data/gear/g1/Pick_bottle_and_turn_and_pour_into_cup
109
+ dataset_weight: 1.0
110
+ distribute_weights: true
111
+ dataset_class: groot.vla.data.dataset.lerobot_sharded.ShardedLeRobotSubLangSingleActionChunkDatasetDROID
112
+ all_modality_configs:
113
+ oxe_droid:
114
+ video:
115
+ _target_: groot.vla.data.dataset.ModalityConfig
116
+ delta_indices:
117
+ - 0
118
+ - 1
119
+ - 2
120
+ - 3
121
+ - 4
122
+ - 5
123
+ - 6
124
+ - 7
125
+ - 8
126
+ - 9
127
+ - 10
128
+ - 11
129
+ - 12
130
+ - 13
131
+ - 14
132
+ - 15
133
+ - 16
134
+ - 17
135
+ - 18
136
+ - 19
137
+ - 20
138
+ - 21
139
+ - 22
140
+ - 23
141
+ - 24
142
+ eval_delta_indices:
143
+ - 0
144
+ modality_keys:
145
+ - video.exterior_image_1_left
146
+ - video.exterior_image_2_left
147
+ - video.wrist_image_left
148
+ state:
149
+ _target_: groot.vla.data.dataset.ModalityConfig
150
+ delta_indices:
151
+ - 0
152
+ modality_keys:
153
+ - state.joint_position
154
+ - state.gripper_position
155
+ action:
156
+ _target_: groot.vla.data.dataset.ModalityConfig
157
+ delta_indices:
158
+ - 0
159
+ - 1
160
+ - 2
161
+ - 3
162
+ - 4
163
+ - 5
164
+ - 6
165
+ - 7
166
+ - 8
167
+ - 9
168
+ - 10
169
+ - 11
170
+ - 12
171
+ - 13
172
+ - 14
173
+ - 15
174
+ - 16
175
+ - 17
176
+ - 18
177
+ - 19
178
+ - 20
179
+ - 21
180
+ - 22
181
+ - 23
182
+ modality_keys:
183
+ - action.joint_position
184
+ - action.gripper_position
185
+ language:
186
+ _target_: groot.vla.data.dataset.ModalityConfig
187
+ delta_indices:
188
+ - 0
189
+ modality_keys:
190
+ - annotation.language.language_instruction
191
+ - annotation.language.language_instruction_2
192
+ - annotation.language.language_instruction_3
193
+ lapa_action:
194
+ _target_: groot.vla.data.dataset.ModalityConfig
195
+ delta_indices:
196
+ - 0
197
+ modality_keys:
198
+ - lapa_action
199
+ agibot:
200
+ video:
201
+ _target_: groot.vla.data.dataset.ModalityConfig
202
+ delta_indices:
203
+ - 0
204
+ - 1
205
+ - 2
206
+ - 3
207
+ - 4
208
+ - 5
209
+ - 6
210
+ - 7
211
+ - 8
212
+ - 9
213
+ - 10
214
+ - 11
215
+ - 12
216
+ - 13
217
+ - 14
218
+ - 15
219
+ - 16
220
+ - 17
221
+ - 18
222
+ - 19
223
+ - 20
224
+ - 21
225
+ - 22
226
+ - 23
227
+ - 24
228
+ eval_delta_indices:
229
+ - -3
230
+ - -2
231
+ - -1
232
+ - 0
233
+ modality_keys:
234
+ - video.top_head
235
+ - video.hand_left
236
+ - video.hand_right
237
+ state:
238
+ _target_: groot.vla.data.dataset.ModalityConfig
239
+ delta_indices:
240
+ - 0
241
+ modality_keys:
242
+ - state.left_arm_joint_position
243
+ - state.right_arm_joint_position
244
+ - state.left_effector_position
245
+ - state.right_effector_position
246
+ - state.head_position
247
+ - state.waist_position
248
+ action:
249
+ _target_: groot.vla.data.dataset.ModalityConfig
250
+ delta_indices:
251
+ - 0
252
+ - 1
253
+ - 2
254
+ - 3
255
+ - 4
256
+ - 5
257
+ - 6
258
+ - 7
259
+ - 8
260
+ - 9
261
+ - 10
262
+ - 11
263
+ - 12
264
+ - 13
265
+ - 14
266
+ - 15
267
+ - 16
268
+ - 17
269
+ - 18
270
+ - 19
271
+ - 20
272
+ - 21
273
+ - 22
274
+ - 23
275
+ modality_keys:
276
+ - action.left_arm_joint_position
277
+ - action.right_arm_joint_position
278
+ - action.left_effector_position
279
+ - action.right_effector_position
280
+ - action.head_position
281
+ - action.waist_position
282
+ - action.robot_velocity
283
+ language:
284
+ _target_: groot.vla.data.dataset.ModalityConfig
285
+ delta_indices:
286
+ - 0
287
+ modality_keys:
288
+ - annotation.language.action_text
289
+ yam:
290
+ video:
291
+ _target_: groot.vla.data.dataset.ModalityConfig
292
+ delta_indices:
293
+ - 0
294
+ - 1
295
+ - 2
296
+ - 3
297
+ - 4
298
+ - 5
299
+ - 6
300
+ - 7
301
+ - 8
302
+ - 9
303
+ - 10
304
+ - 11
305
+ - 12
306
+ - 13
307
+ - 14
308
+ - 15
309
+ - 16
310
+ - 17
311
+ - 18
312
+ - 19
313
+ - 20
314
+ - 21
315
+ - 22
316
+ - 23
317
+ - 24
318
+ eval_delta_indices:
319
+ - 0
320
+ modality_keys:
321
+ - video.top_camera-images-rgb
322
+ - video.left_camera-images-rgb
323
+ - video.right_camera-images-rgb
324
+ state:
325
+ _target_: groot.vla.data.dataset.ModalityConfig
326
+ delta_indices:
327
+ - 0
328
+ modality_keys:
329
+ - state.left_joint_pos
330
+ - state.left_gripper_pos
331
+ - state.right_joint_pos
332
+ - state.right_gripper_pos
333
+ action:
334
+ _target_: groot.vla.data.dataset.ModalityConfig
335
+ delta_indices:
336
+ - 0
337
+ - 1
338
+ - 2
339
+ - 3
340
+ - 4
341
+ - 5
342
+ - 6
343
+ - 7
344
+ - 8
345
+ - 9
346
+ - 10
347
+ - 11
348
+ - 12
349
+ - 13
350
+ - 14
351
+ - 15
352
+ - 16
353
+ - 17
354
+ - 18
355
+ - 19
356
+ - 20
357
+ - 21
358
+ - 22
359
+ - 23
360
+ modality_keys:
361
+ - action.left_joint_pos
362
+ - action.left_gripper_pos
363
+ - action.right_joint_pos
364
+ - action.right_gripper_pos
365
+ language:
366
+ _target_: groot.vla.data.dataset.ModalityConfig
367
+ delta_indices:
368
+ - 0
369
+ modality_keys:
370
+ - annotation.task
371
+ real_teleop_g1:
372
+ video:
373
+ _target_: groot.vla.data.dataset.ModalityConfig
374
+ delta_indices:
375
+ - 0
376
+ - 1
377
+ - 2
378
+ - 3
379
+ - 4
380
+ - 5
381
+ - 6
382
+ - 7
383
+ - 8
384
+ - 9
385
+ - 10
386
+ - 11
387
+ - 12
388
+ - 13
389
+ - 14
390
+ - 15
391
+ - 16
392
+ - 17
393
+ - 18
394
+ - 19
395
+ - 20
396
+ - 21
397
+ - 22
398
+ - 23
399
+ - 24
400
+ eval_delta_indices:
401
+ - 0
402
+ modality_keys:
403
+ - video.egocentric
404
+ state:
405
+ _target_: groot.vla.data.dataset.ModalityConfig
406
+ delta_indices:
407
+ - 0
408
+ modality_keys:
409
+ - state.left_hand
410
+ - state.right_hand
411
+ - state.left_arm
412
+ - state.right_arm
413
+ - state.rpy
414
+ - state.height
415
+ action:
416
+ _target_: groot.vla.data.dataset.ModalityConfig
417
+ delta_indices:
418
+ - 0
419
+ - 1
420
+ - 2
421
+ - 3
422
+ - 4
423
+ - 5
424
+ - 6
425
+ - 7
426
+ - 8
427
+ - 9
428
+ - 10
429
+ - 11
430
+ - 12
431
+ - 13
432
+ - 14
433
+ - 15
434
+ - 16
435
+ - 17
436
+ - 18
437
+ - 19
438
+ - 20
439
+ - 21
440
+ - 22
441
+ - 23
442
+ modality_keys:
443
+ - action.left_hand
444
+ - action.right_hand
445
+ - action.left_arm
446
+ - action.right_arm
447
+ - action.rpy
448
+ - action.height
449
+ - action.torso_vx
450
+ - action.torso_vy
451
+ - action.torso_vyaw
452
+ - action.torso_dyaw
453
+ language:
454
+ _target_: groot.vla.data.dataset.ModalityConfig
455
+ delta_indices:
456
+ - 0
457
+ modality_keys:
458
+ - annotation.language.language_instruction
459
+ all_transforms:
460
+ oxe_droid:
461
+ _target_: groot.vla.data.transform.ComposedModalityTransform
462
+ transforms:
463
+ - _target_: groot.vla.data.transform.VideoToTensor
464
+ apply_to:
465
+ - video.exterior_image_1_left
466
+ - video.exterior_image_2_left
467
+ - video.wrist_image_left
468
+ - _target_: groot.vla.data.transform.VideoCrop
469
+ apply_to:
470
+ - video.exterior_image_1_left
471
+ - video.exterior_image_2_left
472
+ - video.wrist_image_left
473
+ scale: 0.95
474
+ mode: random
475
+ - _target_: groot.vla.data.transform.VideoResize
476
+ apply_to:
477
+ - video.exterior_image_1_left
478
+ - video.exterior_image_2_left
479
+ - video.wrist_image_left
480
+ height: 176
481
+ width: 320
482
+ interpolation: linear
483
+ - _target_: groot.vla.data.transform.VideoColorJitter
484
+ apply_to:
485
+ - video.exterior_image_1_left
486
+ - video.exterior_image_2_left
487
+ - video.wrist_image_left
488
+ brightness: 0.3
489
+ contrast: 0.4
490
+ saturation: 0.5
491
+ hue: 0.08
492
+ - _target_: groot.vla.data.transform.VideoToNumpy
493
+ apply_to:
494
+ - video.exterior_image_1_left
495
+ - video.exterior_image_2_left
496
+ - video.wrist_image_left
497
+ - _target_: groot.vla.data.transform.StateActionToTensor
498
+ apply_to:
499
+ - state.joint_position
500
+ - state.gripper_position
501
+ - _target_: groot.vla.data.transform.StateActionTransform
502
+ apply_to:
503
+ - state.joint_position
504
+ - state.gripper_position
505
+ normalization_modes:
506
+ state.joint_position: q99
507
+ state.gripper_position: q99
508
+ - _target_: groot.vla.data.transform.StateActionToTensor
509
+ apply_to:
510
+ - action.joint_position
511
+ - action.gripper_position
512
+ - _target_: groot.vla.data.transform.StateActionTransform
513
+ apply_to:
514
+ - action.joint_position
515
+ - action.gripper_position
516
+ normalization_modes:
517
+ action.joint_position: q99
518
+ action.gripper_position: q99
519
+ - _target_: groot.vla.data.transform.ConcatTransform
520
+ video_concat_order:
521
+ - video.exterior_image_1_left
522
+ - video.exterior_image_2_left
523
+ - video.wrist_image_left
524
+ state_concat_order:
525
+ - state.joint_position
526
+ - state.gripper_position
527
+ action_concat_order:
528
+ - action.joint_position
529
+ - action.gripper_position
530
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
531
+ default_instruction: Perform the default behavior.
532
+ language_dropout_prob: 0.0
533
+ always_use_default_instruction: false
534
+ max_state_dim: 64
535
+ max_action_dim: 36
536
+ max_length: 512
537
+ state_horizon: 1
538
+ action_horizon: 24
539
+ embodiment_tag_mapping:
540
+ real_gr1_arms_only: 0
541
+ real_gr1_arms_only_annotated: 1
542
+ real_gr1_arms_waist: 2
543
+ real_gr1_arms_waist_annotated: 3
544
+ dexmg_gr1_arms_only_inspire: 4
545
+ dexmg_gr1_arms_only_fourier: 5
546
+ dexmg_gr1_arms_waist_fourier: 6
547
+ robocasa_single_arm: 7
548
+ onex_eve_gripper: 8
549
+ robocasa_gr1_arms_only_inspire_hands: 9
550
+ robocasa_gr1_arms_only_fourier_hands: 10
551
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
552
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
553
+ robocasa_panda_omron: 13
554
+ robocasa_bimanual_panda_parallel_gripper: 15
555
+ robocasa_bimanual_panda_inspire_hand: 16
556
+ oxe_droid: 17
557
+ oxe_fractal: 18
558
+ oxe_language_table: 19
559
+ oxe_bridge: 20
560
+ real_panda_single_arm: 21
561
+ hot3d_hands_only: 23
562
+ gr1_unified: 24
563
+ robocasa_gr1_arms_waist_fourier_hands: 25
564
+ agibot: 26
565
+ lapa: 27
566
+ oxe_mutex: 28
567
+ oxe_roboset: 29
568
+ oxe_plex: 30
569
+ dream: 31
570
+ yam: 32
571
+ xdof: 22
572
+ gr1_unified_segmentation: 14
573
+ language_table_sim: 7
574
+ gr1_isaac: 0
575
+ sim_behavior_r1_pro: 31
576
+ mecka_hands: 27
577
+ real_r1_pro_sharpa: 28
578
+ real_teleop_g1: 33
579
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
580
+ agibot:
581
+ _target_: groot.vla.data.transform.ComposedModalityTransform
582
+ transforms:
583
+ - _target_: groot.vla.data.transform.VideoToTensor
584
+ apply_to:
585
+ - video.top_head
586
+ - video.hand_left
587
+ - video.hand_right
588
+ - _target_: groot.vla.data.transform.VideoCrop
589
+ apply_to:
590
+ - video.top_head
591
+ - video.hand_left
592
+ - video.hand_right
593
+ scale: 0.95
594
+ mode: random
595
+ - _target_: groot.vla.data.transform.VideoResize
596
+ apply_to:
597
+ - video.top_head
598
+ - video.hand_left
599
+ - video.hand_right
600
+ height: 176
601
+ width: 320
602
+ interpolation: linear
603
+ - _target_: groot.vla.data.transform.VideoColorJitter
604
+ apply_to:
605
+ - video.top_head
606
+ - video.hand_left
607
+ - video.hand_right
608
+ brightness: 0.3
609
+ contrast: 0.4
610
+ saturation: 0.5
611
+ hue: 0.08
612
+ - _target_: groot.vla.data.transform.VideoToNumpy
613
+ apply_to:
614
+ - video.top_head
615
+ - video.hand_left
616
+ - video.hand_right
617
+ - _target_: groot.vla.data.transform.StateActionToTensor
618
+ apply_to:
619
+ - state.left_arm_joint_position
620
+ - state.right_arm_joint_position
621
+ - state.left_effector_position
622
+ - state.right_effector_position
623
+ - state.head_position
624
+ - state.waist_position
625
+ - _target_: groot.vla.data.transform.StateActionTransform
626
+ apply_to:
627
+ - state.left_arm_joint_position
628
+ - state.right_arm_joint_position
629
+ - state.left_effector_position
630
+ - state.right_effector_position
631
+ - state.head_position
632
+ - state.waist_position
633
+ normalization_modes:
634
+ state.left_arm_joint_position: q99
635
+ state.right_arm_joint_position: q99
636
+ state.left_effector_position: q99
637
+ state.right_effector_position: q99
638
+ state.head_position: q99
639
+ state.waist_position: q99
640
+ - _target_: groot.vla.data.transform.StateActionToTensor
641
+ apply_to:
642
+ - action.left_arm_joint_position
643
+ - action.right_arm_joint_position
644
+ - action.left_effector_position
645
+ - action.right_effector_position
646
+ - action.head_position
647
+ - action.waist_position
648
+ - action.robot_velocity
649
+ - _target_: groot.vla.data.transform.StateActionTransform
650
+ apply_to:
651
+ - action.left_arm_joint_position
652
+ - action.right_arm_joint_position
653
+ - action.left_effector_position
654
+ - action.right_effector_position
655
+ - action.head_position
656
+ - action.waist_position
657
+ - action.robot_velocity
658
+ normalization_modes:
659
+ action.left_arm_joint_position: q99
660
+ action.right_arm_joint_position: q99
661
+ action.left_effector_position: q99
662
+ action.right_effector_position: q99
663
+ action.head_position: q99
664
+ action.waist_position: q99
665
+ action.robot_velocity: q99
666
+ - _target_: groot.vla.data.transform.ConcatTransform
667
+ video_concat_order:
668
+ - video.top_head
669
+ - video.hand_left
670
+ - video.hand_right
671
+ state_concat_order:
672
+ - state.left_arm_joint_position
673
+ - state.right_arm_joint_position
674
+ - state.left_effector_position
675
+ - state.right_effector_position
676
+ - state.head_position
677
+ - state.waist_position
678
+ action_concat_order:
679
+ - action.left_arm_joint_position
680
+ - action.right_arm_joint_position
681
+ - action.left_effector_position
682
+ - action.right_effector_position
683
+ - action.head_position
684
+ - action.waist_position
685
+ - action.robot_velocity
686
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
687
+ default_instruction: Perform the default behavior.
688
+ language_dropout_prob: 0.0
689
+ always_use_default_instruction: false
690
+ max_state_dim: 64
691
+ max_action_dim: 36
692
+ max_length: 512
693
+ state_horizon: 1
694
+ action_horizon: 24
695
+ embodiment_tag_mapping:
696
+ real_gr1_arms_only: 0
697
+ real_gr1_arms_only_annotated: 1
698
+ real_gr1_arms_waist: 2
699
+ real_gr1_arms_waist_annotated: 3
700
+ dexmg_gr1_arms_only_inspire: 4
701
+ dexmg_gr1_arms_only_fourier: 5
702
+ dexmg_gr1_arms_waist_fourier: 6
703
+ robocasa_single_arm: 7
704
+ onex_eve_gripper: 8
705
+ robocasa_gr1_arms_only_inspire_hands: 9
706
+ robocasa_gr1_arms_only_fourier_hands: 10
707
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
708
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
709
+ robocasa_panda_omron: 13
710
+ robocasa_bimanual_panda_parallel_gripper: 15
711
+ robocasa_bimanual_panda_inspire_hand: 16
712
+ oxe_droid: 17
713
+ oxe_fractal: 18
714
+ oxe_language_table: 19
715
+ oxe_bridge: 20
716
+ real_panda_single_arm: 21
717
+ hot3d_hands_only: 23
718
+ gr1_unified: 24
719
+ robocasa_gr1_arms_waist_fourier_hands: 25
720
+ agibot: 26
721
+ lapa: 27
722
+ oxe_mutex: 28
723
+ oxe_roboset: 29
724
+ oxe_plex: 30
725
+ dream: 31
726
+ yam: 32
727
+ xdof: 22
728
+ gr1_unified_segmentation: 14
729
+ language_table_sim: 7
730
+ gr1_isaac: 0
731
+ sim_behavior_r1_pro: 31
732
+ mecka_hands: 27
733
+ real_r1_pro_sharpa: 28
734
+ real_teleop_g1: 33
735
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
736
+ yam:
737
+ _target_: groot.vla.data.transform.ComposedModalityTransform
738
+ transforms:
739
+ - _target_: groot.vla.data.transform.VideoToTensor
740
+ apply_to:
741
+ - video.top_camera-images-rgb
742
+ - video.left_camera-images-rgb
743
+ - video.right_camera-images-rgb
744
+ - _target_: groot.vla.data.transform.VideoCrop
745
+ apply_to:
746
+ - video.top_camera-images-rgb
747
+ - video.left_camera-images-rgb
748
+ - video.right_camera-images-rgb
749
+ scale: 0.95
750
+ mode: random
751
+ - _target_: groot.vla.data.transform.VideoResize
752
+ apply_to:
753
+ - video.top_camera-images-rgb
754
+ - video.left_camera-images-rgb
755
+ - video.right_camera-images-rgb
756
+ height: 176
757
+ width: 320
758
+ interpolation: linear
759
+ - _target_: groot.vla.data.transform.VideoColorJitter
760
+ apply_to:
761
+ - video.top_camera-images-rgb
762
+ - video.left_camera-images-rgb
763
+ - video.right_camera-images-rgb
764
+ brightness: 0.3
765
+ contrast: 0.4
766
+ saturation: 0.5
767
+ hue: 0.08
768
+ - _target_: groot.vla.data.transform.VideoToNumpy
769
+ apply_to:
770
+ - video.top_camera-images-rgb
771
+ - video.left_camera-images-rgb
772
+ - video.right_camera-images-rgb
773
+ - _target_: groot.vla.data.transform.StateActionToTensor
774
+ apply_to:
775
+ - state.left_joint_pos
776
+ - state.left_gripper_pos
777
+ - state.right_joint_pos
778
+ - state.right_gripper_pos
779
+ - _target_: groot.vla.data.transform.StateActionTransform
780
+ apply_to:
781
+ - state.left_joint_pos
782
+ - state.left_gripper_pos
783
+ - state.right_joint_pos
784
+ - state.right_gripper_pos
785
+ normalization_modes:
786
+ state.left_joint_pos: q99
787
+ state.left_gripper_pos: q99
788
+ state.right_joint_pos: q99
789
+ state.right_gripper_pos: q99
790
+ - _target_: groot.vla.data.transform.StateActionToTensor
791
+ apply_to:
792
+ - action.left_joint_pos
793
+ - action.left_gripper_pos
794
+ - action.right_joint_pos
795
+ - action.right_gripper_pos
796
+ - _target_: groot.vla.data.transform.StateActionTransform
797
+ apply_to:
798
+ - action.left_joint_pos
799
+ - action.left_gripper_pos
800
+ - action.right_joint_pos
801
+ - action.right_gripper_pos
802
+ normalization_modes:
803
+ action.left_joint_pos: q99
804
+ action.left_gripper_pos: q99
805
+ action.right_joint_pos: q99
806
+ action.right_gripper_pos: q99
807
+ - _target_: groot.vla.data.transform.ConcatTransform
808
+ video_concat_order:
809
+ - video.top_camera-images-rgb
810
+ - video.left_camera-images-rgb
811
+ - video.right_camera-images-rgb
812
+ state_concat_order:
813
+ - state.left_joint_pos
814
+ - state.left_gripper_pos
815
+ - state.right_joint_pos
816
+ - state.right_gripper_pos
817
+ action_concat_order:
818
+ - action.left_joint_pos
819
+ - action.left_gripper_pos
820
+ - action.right_joint_pos
821
+ - action.right_gripper_pos
822
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
823
+ default_instruction: Perform the default behavior.
824
+ language_dropout_prob: 0.0
825
+ always_use_default_instruction: false
826
+ max_state_dim: 64
827
+ max_action_dim: 36
828
+ max_length: 512
829
+ state_horizon: 1
830
+ action_horizon: 24
831
+ embodiment_tag_mapping:
832
+ real_gr1_arms_only: 0
833
+ real_gr1_arms_only_annotated: 1
834
+ real_gr1_arms_waist: 2
835
+ real_gr1_arms_waist_annotated: 3
836
+ dexmg_gr1_arms_only_inspire: 4
837
+ dexmg_gr1_arms_only_fourier: 5
838
+ dexmg_gr1_arms_waist_fourier: 6
839
+ robocasa_single_arm: 7
840
+ onex_eve_gripper: 8
841
+ robocasa_gr1_arms_only_inspire_hands: 9
842
+ robocasa_gr1_arms_only_fourier_hands: 10
843
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
844
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
845
+ robocasa_panda_omron: 13
846
+ robocasa_bimanual_panda_parallel_gripper: 15
847
+ robocasa_bimanual_panda_inspire_hand: 16
848
+ oxe_droid: 17
849
+ oxe_fractal: 18
850
+ oxe_language_table: 19
851
+ oxe_bridge: 20
852
+ real_panda_single_arm: 21
853
+ hot3d_hands_only: 23
854
+ gr1_unified: 24
855
+ robocasa_gr1_arms_waist_fourier_hands: 25
856
+ agibot: 26
857
+ lapa: 27
858
+ oxe_mutex: 28
859
+ oxe_roboset: 29
860
+ oxe_plex: 30
861
+ dream: 31
862
+ yam: 32
863
+ xdof: 22
864
+ gr1_unified_segmentation: 14
865
+ language_table_sim: 7
866
+ gr1_isaac: 0
867
+ sim_behavior_r1_pro: 31
868
+ mecka_hands: 27
869
+ real_r1_pro_sharpa: 28
870
+ real_teleop_g1: 33
871
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
872
+ real_teleop_g1:
873
+ _target_: groot.vla.data.transform.ComposedModalityTransform
874
+ transforms:
875
+ - _target_: groot.vla.data.transform.VideoToTensor
876
+ apply_to:
877
+ - video.egocentric
878
+ - _target_: groot.vla.data.transform.VideoCrop
879
+ apply_to:
880
+ - video.egocentric
881
+ scale: 0.95
882
+ mode: random
883
+ - _target_: groot.vla.data.transform.VideoResize
884
+ apply_to:
885
+ - video.egocentric
886
+ height: 176
887
+ width: 320
888
+ interpolation: linear
889
+ - _target_: groot.vla.data.transform.VideoColorJitter
890
+ apply_to:
891
+ - video.egocentric
892
+ brightness: 0.3
893
+ contrast: 0.4
894
+ saturation: 0.5
895
+ hue: 0.08
896
+ - _target_: groot.vla.data.transform.VideoToNumpy
897
+ apply_to:
898
+ - video.egocentric
899
+ - _target_: groot.vla.data.transform.StateActionToTensor
900
+ apply_to:
901
+ - state.left_hand
902
+ - state.right_hand
903
+ - state.left_arm
904
+ - state.right_arm
905
+ - state.rpy
906
+ - state.height
907
+ - _target_: groot.vla.data.transform.StateActionTransform
908
+ apply_to:
909
+ - state.left_hand
910
+ - state.right_hand
911
+ - state.left_arm
912
+ - state.right_arm
913
+ - state.rpy
914
+ - state.height
915
+ normalization_modes:
916
+ state.left_hand: q99
917
+ state.right_hand: q99
918
+ state.left_arm: q99
919
+ state.right_arm: q99
920
+ state.rpy: q99
921
+ state.height: q99
922
+ - _target_: groot.vla.data.transform.StateActionToTensor
923
+ apply_to:
924
+ - action.left_hand
925
+ - action.right_hand
926
+ - action.left_arm
927
+ - action.right_arm
928
+ - action.rpy
929
+ - action.height
930
+ - action.torso_vx
931
+ - action.torso_vy
932
+ - action.torso_vyaw
933
+ - action.torso_dyaw
934
+ - _target_: groot.vla.data.transform.StateActionTransform
935
+ apply_to:
936
+ - action.left_hand
937
+ - action.right_hand
938
+ - action.left_arm
939
+ - action.right_arm
940
+ - action.rpy
941
+ - action.height
942
+ - action.torso_vx
943
+ - action.torso_vy
944
+ - action.torso_vyaw
945
+ - action.torso_dyaw
946
+ normalization_modes:
947
+ action.left_hand: q99
948
+ action.right_hand: q99
949
+ action.left_arm: q99
950
+ action.right_arm: q99
951
+ action.rpy: q99
952
+ action.height: q99
953
+ action.torso_vx: q99
954
+ action.torso_vy: q99
955
+ action.torso_vyaw: q99
956
+ action.torso_dyaw: q99
957
+ - _target_: groot.vla.data.transform.ConcatTransform
958
+ video_concat_order:
959
+ - video.egocentric
960
+ state_concat_order:
961
+ - state.left_hand
962
+ - state.right_hand
963
+ - state.left_arm
964
+ - state.right_arm
965
+ - state.rpy
966
+ - state.height
967
+ action_concat_order:
968
+ - action.left_hand
969
+ - action.right_hand
970
+ - action.left_arm
971
+ - action.right_arm
972
+ - action.rpy
973
+ - action.height
974
+ - action.torso_vx
975
+ - action.torso_vy
976
+ - action.torso_vyaw
977
+ - action.torso_dyaw
978
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
979
+ default_instruction: Perform the default behavior.
980
+ language_dropout_prob: 0.0
981
+ always_use_default_instruction: false
982
+ max_state_dim: 64
983
+ max_action_dim: 36
984
+ max_length: 512
985
+ state_horizon: 1
986
+ action_horizon: 24
987
+ embodiment_tag_mapping:
988
+ real_gr1_arms_only: 0
989
+ real_gr1_arms_only_annotated: 1
990
+ real_gr1_arms_waist: 2
991
+ real_gr1_arms_waist_annotated: 3
992
+ dexmg_gr1_arms_only_inspire: 4
993
+ dexmg_gr1_arms_only_fourier: 5
994
+ dexmg_gr1_arms_waist_fourier: 6
995
+ robocasa_single_arm: 7
996
+ onex_eve_gripper: 8
997
+ robocasa_gr1_arms_only_inspire_hands: 9
998
+ robocasa_gr1_arms_only_fourier_hands: 10
999
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
1000
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
1001
+ robocasa_panda_omron: 13
1002
+ robocasa_bimanual_panda_parallel_gripper: 15
1003
+ robocasa_bimanual_panda_inspire_hand: 16
1004
+ oxe_droid: 17
1005
+ oxe_fractal: 18
1006
+ oxe_language_table: 19
1007
+ oxe_bridge: 20
1008
+ real_panda_single_arm: 21
1009
+ hot3d_hands_only: 23
1010
+ gr1_unified: 24
1011
+ robocasa_gr1_arms_waist_fourier_hands: 25
1012
+ agibot: 26
1013
+ lapa: 27
1014
+ oxe_mutex: 28
1015
+ oxe_roboset: 29
1016
+ oxe_plex: 30
1017
+ dream: 31
1018
+ yam: 32
1019
+ xdof: 22
1020
+ gr1_unified_segmentation: 14
1021
+ language_table_sim: 7
1022
+ gr1_isaac: 0
1023
+ sim_behavior_r1_pro: 31
1024
+ mecka_hands: 27
1025
+ real_r1_pro_sharpa: 28
1026
+ real_teleop_g1: 33
1027
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
1028
+ metadata_versions:
1029
+ oxe_droid: '0221'
1030
+ agibot: '0221'
1031
+ yam: '0221'
1032
+ real_teleop_g1: '0221'
1033
+ fps:
1034
+ yam: 30
1035
+ real_teleop_g1: 30
1036
+ dataset_kwargs:
1037
+ video_backend: decord
1038
+ use_global_metadata: false
1039
+ max_chunk_size: 4
1040
+ relative_action: true
1041
+ relative_action_keys:
1042
+ - left_hand
1043
+ - right_hand
1044
+ - left_arm
1045
+ - right_arm
1046
+ - rpy
1047
+ - height
1048
+ relative_action_per_horizon: false
1049
+ mixture_kwargs:
1050
+ training: true
1051
+ balance_dataset_weights: false
1052
+ seed: 42
1053
+ shard_sampling_rate: 0.1
1054
+ trainer:
1055
+ _target_: groot.vla.experiment.VLATrainer
1056
+ _partial_: true
1057
+ _recursive_: false
1058
+ callbacks: null
1059
+ model: ???
1060
+ train_dataset: ???
1061
+ compute_dtype: ???
1062
+ benchmark_time: false
1063
+ enable_profiling: false
1064
+ profiling_steps: 5
1065
+ enable_prof_callback: false
1066
+ profile_start_step: 50
1067
+ profile_warmup_steps: 1
1068
+ profile_active_steps: 3
1069
+ profile_record_shapes: false
1070
+ profile_with_stack: false
1071
+ profile_memory: false
1072
+ wandb_project: dreamzero
1073
+ output_dir: ./checkpoints/dreamzero_real_teleop_g1_full_finetune
1074
+ load_from_yaml: null
1075
+ gear_credentials: null
1076
+ upload_checkpoints: false
1077
+ upload_every: 1000
1078
+ upload_last_n_checkpoints: 5
1079
+ remove_unused_columns: false
1080
+ bf16: true
1081
+ tf32: true
1082
+ global_batch_size: null
1083
+ raise_error_if_global_batch_size_not_set: false
1084
+ per_device_train_batch_size: 1
1085
+ per_device_eval_batch_size: 64
1086
+ gradient_accumulation_steps: 1
1087
+ dataloader_num_workers: 1
1088
+ dataloader_pin_memory: false
1089
+ dataloader_persistent_workers: true
1090
+ optim: adamw_torch
1091
+ learning_rate: 0.0001
1092
+ adam_beta1: 0.95
1093
+ adam_beta2: 0.999
1094
+ adam_epsilon: 1.0e-08
1095
+ weight_decay: 1.0e-05
1096
+ lr_scheduler_type: cosine
1097
+ warmup_ratio: 0.05
1098
+ logging_steps: 10.0
1099
+ num_train_epochs: 1000
1100
+ max_steps: 20000
1101
+ save_strategy: steps
1102
+ save_steps: 8000
1103
+ eval_strategy: 'no'
1104
+ save_total_limit: 10
1105
+ report_to: wandb
1106
+ seed: 42
1107
+ do_eval: false
1108
+ gradient_checkpointing: false
1109
+ ddp_find_unused_parameters: false
1110
+ ddp_bucket_cap_mb: 100
1111
+ ray_num_workers: ???
1112
+ eval_bf16: true
1113
+ torch_compile_mode: null
1114
+ pretrained_model_path: /hfm/boqian/liboqian_data/checkpoints/DreamZero-AgiBot
1115
+ only_tune_projectors: false
1116
+ save_llm: false
1117
+ save_lora_only: false
1118
+ save_value_model: false
1119
+ save_q_model: false
1120
+ download_cache: false
1121
+ training_args:
1122
+ _target_: transformers.TrainingArguments
1123
+ output_dir: ./checkpoints/dreamzero_real_teleop_g1_full_finetune
1124
+ run_name: dreamzero_real_teleop_g1_full_finetune
1125
+ remove_unused_columns: false
1126
+ deepspeed: groot/vla/configs/deepspeed/zero2_offload.json
1127
+ gradient_checkpointing: false
1128
+ bf16: true
1129
+ tf32: true
1130
+ per_device_train_batch_size: 1
1131
+ per_device_eval_batch_size: 64
1132
+ gradient_accumulation_steps: 1
1133
+ dataloader_num_workers: 1
1134
+ dataloader_pin_memory: false
1135
+ dataloader_persistent_workers: true
1136
+ optim: adamw_torch
1137
+ adam_beta1: 0.95
1138
+ adam_beta2: 0.999
1139
+ adam_epsilon: 1.0e-08
1140
+ learning_rate: 1.0e-05
1141
+ weight_decay: 1.0e-05
1142
+ warmup_ratio: 0.05
1143
+ lr_scheduler_type: cosine
1144
+ logging_steps: 10.0
1145
+ num_train_epochs: 1000
1146
+ max_steps: 20000
1147
+ save_strategy: steps
1148
+ save_steps: 8000
1149
+ save_total_limit: 10
1150
+ report_to: wandb
1151
+ seed: 42
1152
+ do_eval: false
1153
+ ddp_find_unused_parameters: false
1154
+ ddp_bucket_cap_mb: 100
1155
+ torch_compile_mode: null
1156
+ profile_dir: null
1157
+ backbone_hidden_size: 0
1158
+ backbone_cfg:
1159
+ _target_: groot.vla.model.dreamzero.backbone.identity.IdentityBackbone
1160
+ action_head_cfg:
1161
+ config:
1162
+ backbone_features_projector_cfg: null
1163
+ _target_: groot.vla.model.dreamzero.action_head.wan_flow_matching_action_tf.WANPolicyHeadConfig
1164
+ _recursive_: false
1165
+ tiled: false
1166
+ tile_size_height: 34
1167
+ tile_size_width: 34
1168
+ tile_stride_height: 18
1169
+ tile_stride_width: 16
1170
+ lora_rank: 4
1171
+ lora_alpha: 4
1172
+ num_frames: 33
1173
+ num_frame_per_block: 2
1174
+ lora_target_modules: q,k,v,o,ffn.0,ffn.2
1175
+ init_lora_weights: kaiming
1176
+ train_architecture: full
1177
+ use_gradient_checkpointing: true
1178
+ add_pos_embed: true
1179
+ model_dtype: float32
1180
+ max_state_dim: 64
1181
+ max_action_dim: 36
1182
+ action_loss_embodiment_ids:
1183
+ - 26
1184
+ - 17
1185
+ - 32
1186
+ hidden_size: 64
1187
+ input_embedding_dim: 1536
1188
+ backbone_embedding_dim: 0
1189
+ repa_layer: 8
1190
+ repa_coeff: 1.0
1191
+ load_pretrained_det_decode_layer_path: null
1192
+ freeze_decode_layer: false
1193
+ expand_batch: null
1194
+ use_vlln: true
1195
+ vl_self_attention_cfg:
1196
+ _target_: groot.vla.model.n1_5.modules.cross_attention_dit.SelfAttentionTransformer
1197
+ positional_embeddings: null
1198
+ num_layers: 4
1199
+ num_attention_heads: 24
1200
+ attention_head_dim: 64
1201
+ dropout: 0.2
1202
+ final_dropout: true
1203
+ diffusion_model_cfg:
1204
+ _target_: groot.vla.model.dreamzero.modules.wan_video_dit_action_casual_chunk.CausalWanModel
1205
+ _convert_: object
1206
+ diffusion_model_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P
1207
+ model_type: i2v
1208
+ frame_seqlen: 220
1209
+ dim: 5120
1210
+ in_dim: 36
1211
+ ffn_dim: 13824
1212
+ out_dim: 16
1213
+ freq_dim: 256
1214
+ eps: 1.0e-06
1215
+ num_heads: 40
1216
+ num_layers: 40
1217
+ max_chunk_size: 4
1218
+ num_frame_per_block: 2
1219
+ num_action_per_block: 24
1220
+ num_state_per_block: 1
1221
+ action_dim: 36
1222
+ text_encoder_cfg:
1223
+ _target_: groot.vla.model.dreamzero.modules.wan_video_text_encoder.WanTextEncoder
1224
+ _convert_: object
1225
+ text_encoder_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/models_t5_umt5-xxl-enc-bf16.pth
1226
+ image_encoder_cfg:
1227
+ _target_: groot.vla.model.dreamzero.modules.wan_video_image_encoder.WanImageEncoder
1228
+ _convert_: object
1229
+ image_encoder_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth
1230
+ vae_cfg:
1231
+ _target_: groot.vla.model.dreamzero.modules.wan_video_vae.WanVideoVAE
1232
+ _convert_: object
1233
+ vae_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/Wan2.1_VAE.pth
1234
+ action_dim: 36
1235
+ action_horizon: 24
1236
+ num_inference_timesteps: 4
1237
+ noise_beta_alpha: 1.5
1238
+ noise_beta_beta: 1.0
1239
+ noise_s: 0.999
1240
+ num_timestep_buckets: 1000
1241
+ decouple_video_action_noise: false
1242
+ video_noise_beta_alpha: 3.0
1243
+ video_noise_beta_beta: 1.0
1244
+ tune_projector: true
1245
+ tune_diffusion_model: true
1246
+ skip_component_loading: true
1247
+ _target_: groot.vla.model.dreamzero.action_head.wan_flow_matching_action_tf.WANPolicyHead
1248
+ _convert_: object
1249
+ add_pos_embed: true
1250
+ hidden_size: 64
1251
+ attn_dropout: 0.2
1252
+ repa_layer: 8
1253
+ repa_coeff: 1.0
1254
+ load_pretrained_det_decode_layer_path: null
1255
+ expand_batch: null
1256
+ dit_version: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P
1257
+ text_encoder_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/models_t5_umt5-xxl-enc-bf16.pth
1258
+ image_encoder_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth
1259
+ vae_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/Wan2.1_VAE.pth
1260
+ train_architecture: full
1261
+ num_frame_per_block: 2
1262
+ num_action_per_block: 24
1263
+ num_state_per_block: 1
1264
+ frame_seqlen: 220
1265
+ embodiment_tag_to_projector_index:
1266
+ real_gr1_arms_only: 0
1267
+ real_gr1_arms_only_annotated: 1
1268
+ real_gr1_arms_waist: 2
1269
+ real_gr1_arms_waist_annotated: 3
1270
+ dexmg_gr1_arms_only_inspire: 4
1271
+ dexmg_gr1_arms_only_fourier: 5
1272
+ dexmg_gr1_arms_waist_fourier: 6
1273
+ robocasa_single_arm: 7
1274
+ onex_eve_gripper: 8
1275
+ robocasa_gr1_arms_only_inspire_hands: 9
1276
+ robocasa_gr1_arms_only_fourier_hands: 10
1277
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
1278
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
1279
+ robocasa_panda_omron: 13
1280
+ robocasa_bimanual_panda_parallel_gripper: 15
1281
+ robocasa_bimanual_panda_inspire_hand: 16
1282
+ oxe_droid: 17
1283
+ oxe_fractal: 18
1284
+ oxe_language_table: 19
1285
+ oxe_bridge: 20
1286
+ real_panda_single_arm: 21
1287
+ hot3d_hands_only: 23
1288
+ gr1_unified: 24
1289
+ robocasa_gr1_arms_waist_fourier_hands: 25
1290
+ agibot: 26
1291
+ lapa: 27
1292
+ oxe_mutex: 28
1293
+ oxe_roboset: 29
1294
+ oxe_plex: 30
1295
+ dream: 31
1296
+ yam: 32
1297
+ xdof: 22
1298
+ gr1_unified_segmentation: 14
1299
+ language_table_sim: 7
1300
+ gr1_isaac: 0
1301
+ sim_behavior_r1_pro: 31
1302
+ mecka_hands: 27
1303
+ real_r1_pro_sharpa: 28
1304
+ real_teleop_g1: 33
1305
+ max_length: 512
1306
+ num_views: 1
1307
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
1308
+ data_collator:
1309
+ _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DefaultDataCollator
1310
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
1311
+ max_length: 512
1312
+ num_views: 1
1313
+ embodiment_tag_mapping:
1314
+ real_gr1_arms_only: 0
1315
+ real_gr1_arms_only_annotated: 1
1316
+ real_gr1_arms_waist: 2
1317
+ real_gr1_arms_waist_annotated: 3
1318
+ dexmg_gr1_arms_only_inspire: 4
1319
+ dexmg_gr1_arms_only_fourier: 5
1320
+ dexmg_gr1_arms_waist_fourier: 6
1321
+ robocasa_single_arm: 7
1322
+ onex_eve_gripper: 8
1323
+ robocasa_gr1_arms_only_inspire_hands: 9
1324
+ robocasa_gr1_arms_only_fourier_hands: 10
1325
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
1326
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
1327
+ robocasa_panda_omron: 13
1328
+ robocasa_bimanual_panda_parallel_gripper: 15
1329
+ robocasa_bimanual_panda_inspire_hand: 16
1330
+ oxe_droid: 17
1331
+ oxe_fractal: 18
1332
+ oxe_language_table: 19
1333
+ oxe_bridge: 20
1334
+ real_panda_single_arm: 21
1335
+ hot3d_hands_only: 23
1336
+ gr1_unified: 24
1337
+ robocasa_gr1_arms_waist_fourier_hands: 25
1338
+ agibot: 26
1339
+ lapa: 27
1340
+ oxe_mutex: 28
1341
+ oxe_roboset: 29
1342
+ oxe_plex: 30
1343
+ dream: 31
1344
+ yam: 32
1345
+ xdof: 22
1346
+ gr1_unified_segmentation: 14
1347
+ language_table_sim: 7
1348
+ gr1_isaac: 0
1349
+ sim_behavior_r1_pro: 31
1350
+ mecka_hands: 27
1351
+ real_r1_pro_sharpa: 28
1352
+ real_teleop_g1: 33
1353
+ num_visual_tokens_per_frame: 16
1354
+ max_state_dim: 64
1355
+ max_action_dim: 36
1356
+ language_dropout_prob: 0.0
1357
+ model_specific_transform:
1358
+ _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
1359
+ default_instruction: Perform the default behavior.
1360
+ language_dropout_prob: 0.0
1361
+ always_use_default_instruction: false
1362
+ max_state_dim: 64
1363
+ max_action_dim: 36
1364
+ max_length: 512
1365
+ state_horizon: 1
1366
+ action_horizon: 24
1367
+ embodiment_tag_mapping:
1368
+ real_gr1_arms_only: 0
1369
+ real_gr1_arms_only_annotated: 1
1370
+ real_gr1_arms_waist: 2
1371
+ real_gr1_arms_waist_annotated: 3
1372
+ dexmg_gr1_arms_only_inspire: 4
1373
+ dexmg_gr1_arms_only_fourier: 5
1374
+ dexmg_gr1_arms_waist_fourier: 6
1375
+ robocasa_single_arm: 7
1376
+ onex_eve_gripper: 8
1377
+ robocasa_gr1_arms_only_inspire_hands: 9
1378
+ robocasa_gr1_arms_only_fourier_hands: 10
1379
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
1380
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
1381
+ robocasa_panda_omron: 13
1382
+ robocasa_bimanual_panda_parallel_gripper: 15
1383
+ robocasa_bimanual_panda_inspire_hand: 16
1384
+ oxe_droid: 17
1385
+ oxe_fractal: 18
1386
+ oxe_language_table: 19
1387
+ oxe_bridge: 20
1388
+ real_panda_single_arm: 21
1389
+ hot3d_hands_only: 23
1390
+ gr1_unified: 24
1391
+ robocasa_gr1_arms_waist_fourier_hands: 25
1392
+ agibot: 26
1393
+ lapa: 27
1394
+ oxe_mutex: 28
1395
+ oxe_roboset: 29
1396
+ oxe_plex: 30
1397
+ dream: 31
1398
+ yam: 32
1399
+ xdof: 22
1400
+ gr1_unified_segmentation: 14
1401
+ language_table_sim: 7
1402
+ gr1_isaac: 0
1403
+ sim_behavior_r1_pro: 31
1404
+ mecka_hands: 27
1405
+ real_r1_pro_sharpa: 28
1406
+ real_teleop_g1: 33
1407
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
1408
+ use_global_metadata: false
1409
+ num_frames: 33
1410
+ action_horizon: 24
1411
+ state_horizon: 1
1412
+ image_resolution_width: 320
1413
+ image_resolution_height: 176
1414
+ image_resolution_width_single_frame: 256
1415
+ image_resolution_height_single_frame: 256
1416
+ totensor_cfg:
1417
+ _target_: groot.vla.data.transform.VideoToTensor
1418
+ apply_to: ???
1419
+ crop_cfg:
1420
+ _target_: groot.vla.data.transform.VideoCrop
1421
+ apply_to: ???
1422
+ scale: 0.95
1423
+ mode: random
1424
+ resize_cfg:
1425
+ _target_: groot.vla.data.transform.VideoResize
1426
+ apply_to: ???
1427
+ height: 176
1428
+ width: 320
1429
+ interpolation: linear
1430
+ resize_cfg_single_frame:
1431
+ _target_: groot.vla.data.transform.VideoResize
1432
+ apply_to: ???
1433
+ height: 256
1434
+ width: 256
1435
+ interpolation: linear
1436
+ color_jitter_cfg:
1437
+ _target_: groot.vla.data.transform.VideoColorJitter
1438
+ apply_to: ???
1439
+ brightness: 0.3
1440
+ contrast: 0.4
1441
+ saturation: 0.5
1442
+ hue: 0.08
1443
+ random_grayscale_cfg:
1444
+ _target_: groot.vla.data.transform.VideoRandomGrayscale
1445
+ apply_to: ???
1446
+ p: 0.1
1447
+ random_posterize_cfg:
1448
+ _target_: groot.vla.data.transform.VideoRandomPosterize
1449
+ apply_to: ???
1450
+ bits: 4
1451
+ p: 0.1
1452
+ normalize_cfg:
1453
+ _target_: groot.vla.data.transform.VideoNormalize
1454
+ apply_to: ???
1455
+ mean:
1456
+ - 0.5
1457
+ - 0.5
1458
+ - 0.5
1459
+ std:
1460
+ - 0.5
1461
+ - 0.5
1462
+ - 0.5
1463
+ to_numpy_cfg:
1464
+ _target_: groot.vla.data.transform.VideoToNumpy
1465
+ apply_to: ???
1466
+ modality_config_oxe_droid:
1467
+ video:
1468
+ _target_: groot.vla.data.dataset.ModalityConfig
1469
+ delta_indices:
1470
+ - 0
1471
+ - 1
1472
+ - 2
1473
+ - 3
1474
+ - 4
1475
+ - 5
1476
+ - 6
1477
+ - 7
1478
+ - 8
1479
+ - 9
1480
+ - 10
1481
+ - 11
1482
+ - 12
1483
+ - 13
1484
+ - 14
1485
+ - 15
1486
+ - 16
1487
+ - 17
1488
+ - 18
1489
+ - 19
1490
+ - 20
1491
+ - 21
1492
+ - 22
1493
+ - 23
1494
+ - 24
1495
+ eval_delta_indices:
1496
+ - 0
1497
+ modality_keys:
1498
+ - video.exterior_image_1_left
1499
+ - video.exterior_image_2_left
1500
+ - video.wrist_image_left
1501
+ state:
1502
+ _target_: groot.vla.data.dataset.ModalityConfig
1503
+ delta_indices:
1504
+ - 0
1505
+ modality_keys:
1506
+ - state.joint_position
1507
+ - state.gripper_position
1508
+ action:
1509
+ _target_: groot.vla.data.dataset.ModalityConfig
1510
+ delta_indices:
1511
+ - 0
1512
+ - 1
1513
+ - 2
1514
+ - 3
1515
+ - 4
1516
+ - 5
1517
+ - 6
1518
+ - 7
1519
+ - 8
1520
+ - 9
1521
+ - 10
1522
+ - 11
1523
+ - 12
1524
+ - 13
1525
+ - 14
1526
+ - 15
1527
+ - 16
1528
+ - 17
1529
+ - 18
1530
+ - 19
1531
+ - 20
1532
+ - 21
1533
+ - 22
1534
+ - 23
1535
+ modality_keys:
1536
+ - action.joint_position
1537
+ - action.gripper_position
1538
+ language:
1539
+ _target_: groot.vla.data.dataset.ModalityConfig
1540
+ delta_indices:
1541
+ - 0
1542
+ modality_keys:
1543
+ - annotation.language.language_instruction
1544
+ - annotation.language.language_instruction_2
1545
+ - annotation.language.language_instruction_3
1546
+ lapa_action:
1547
+ _target_: groot.vla.data.dataset.ModalityConfig
1548
+ delta_indices:
1549
+ - 0
1550
+ modality_keys:
1551
+ - lapa_action
1552
+ transform_oxe_droid:
1553
+ _target_: groot.vla.data.transform.ComposedModalityTransform
1554
+ transforms:
1555
+ - _target_: groot.vla.data.transform.VideoToTensor
1556
+ apply_to:
1557
+ - video.exterior_image_1_left
1558
+ - video.exterior_image_2_left
1559
+ - video.wrist_image_left
1560
+ - _target_: groot.vla.data.transform.VideoCrop
1561
+ apply_to:
1562
+ - video.exterior_image_1_left
1563
+ - video.exterior_image_2_left
1564
+ - video.wrist_image_left
1565
+ scale: 0.95
1566
+ mode: random
1567
+ - _target_: groot.vla.data.transform.VideoResize
1568
+ apply_to:
1569
+ - video.exterior_image_1_left
1570
+ - video.exterior_image_2_left
1571
+ - video.wrist_image_left
1572
+ height: 176
1573
+ width: 320
1574
+ interpolation: linear
1575
+ - _target_: groot.vla.data.transform.VideoColorJitter
1576
+ apply_to:
1577
+ - video.exterior_image_1_left
1578
+ - video.exterior_image_2_left
1579
+ - video.wrist_image_left
1580
+ brightness: 0.3
1581
+ contrast: 0.4
1582
+ saturation: 0.5
1583
+ hue: 0.08
1584
+ - _target_: groot.vla.data.transform.VideoToNumpy
1585
+ apply_to:
1586
+ - video.exterior_image_1_left
1587
+ - video.exterior_image_2_left
1588
+ - video.wrist_image_left
1589
+ - _target_: groot.vla.data.transform.StateActionToTensor
1590
+ apply_to:
1591
+ - state.joint_position
1592
+ - state.gripper_position
1593
+ - _target_: groot.vla.data.transform.StateActionTransform
1594
+ apply_to:
1595
+ - state.joint_position
1596
+ - state.gripper_position
1597
+ normalization_modes:
1598
+ state.joint_position: q99
1599
+ state.gripper_position: q99
1600
+ - _target_: groot.vla.data.transform.StateActionToTensor
1601
+ apply_to:
1602
+ - action.joint_position
1603
+ - action.gripper_position
1604
+ - _target_: groot.vla.data.transform.StateActionTransform
1605
+ apply_to:
1606
+ - action.joint_position
1607
+ - action.gripper_position
1608
+ normalization_modes:
1609
+ action.joint_position: q99
1610
+ action.gripper_position: q99
1611
+ - _target_: groot.vla.data.transform.ConcatTransform
1612
+ video_concat_order:
1613
+ - video.exterior_image_1_left
1614
+ - video.exterior_image_2_left
1615
+ - video.wrist_image_left
1616
+ state_concat_order:
1617
+ - state.joint_position
1618
+ - state.gripper_position
1619
+ action_concat_order:
1620
+ - action.joint_position
1621
+ - action.gripper_position
1622
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
1623
+ default_instruction: Perform the default behavior.
1624
+ language_dropout_prob: 0.0
1625
+ always_use_default_instruction: false
1626
+ max_state_dim: 64
1627
+ max_action_dim: 36
1628
+ max_length: 512
1629
+ state_horizon: 1
1630
+ action_horizon: 24
1631
+ embodiment_tag_mapping:
1632
+ real_gr1_arms_only: 0
1633
+ real_gr1_arms_only_annotated: 1
1634
+ real_gr1_arms_waist: 2
1635
+ real_gr1_arms_waist_annotated: 3
1636
+ dexmg_gr1_arms_only_inspire: 4
1637
+ dexmg_gr1_arms_only_fourier: 5
1638
+ dexmg_gr1_arms_waist_fourier: 6
1639
+ robocasa_single_arm: 7
1640
+ onex_eve_gripper: 8
1641
+ robocasa_gr1_arms_only_inspire_hands: 9
1642
+ robocasa_gr1_arms_only_fourier_hands: 10
1643
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
1644
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
1645
+ robocasa_panda_omron: 13
1646
+ robocasa_bimanual_panda_parallel_gripper: 15
1647
+ robocasa_bimanual_panda_inspire_hand: 16
1648
+ oxe_droid: 17
1649
+ oxe_fractal: 18
1650
+ oxe_language_table: 19
1651
+ oxe_bridge: 20
1652
+ real_panda_single_arm: 21
1653
+ hot3d_hands_only: 23
1654
+ gr1_unified: 24
1655
+ robocasa_gr1_arms_waist_fourier_hands: 25
1656
+ agibot: 26
1657
+ lapa: 27
1658
+ oxe_mutex: 28
1659
+ oxe_roboset: 29
1660
+ oxe_plex: 30
1661
+ dream: 31
1662
+ yam: 32
1663
+ xdof: 22
1664
+ gr1_unified_segmentation: 14
1665
+ language_table_sim: 7
1666
+ gr1_isaac: 0
1667
+ sim_behavior_r1_pro: 31
1668
+ mecka_hands: 27
1669
+ real_r1_pro_sharpa: 28
1670
+ real_teleop_g1: 33
1671
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
1672
+ modality_config_agibot:
1673
+ video:
1674
+ _target_: groot.vla.data.dataset.ModalityConfig
1675
+ delta_indices:
1676
+ - 0
1677
+ - 1
1678
+ - 2
1679
+ - 3
1680
+ - 4
1681
+ - 5
1682
+ - 6
1683
+ - 7
1684
+ - 8
1685
+ - 9
1686
+ - 10
1687
+ - 11
1688
+ - 12
1689
+ - 13
1690
+ - 14
1691
+ - 15
1692
+ - 16
1693
+ - 17
1694
+ - 18
1695
+ - 19
1696
+ - 20
1697
+ - 21
1698
+ - 22
1699
+ - 23
1700
+ - 24
1701
+ eval_delta_indices:
1702
+ - -3
1703
+ - -2
1704
+ - -1
1705
+ - 0
1706
+ modality_keys:
1707
+ - video.top_head
1708
+ - video.hand_left
1709
+ - video.hand_right
1710
+ state:
1711
+ _target_: groot.vla.data.dataset.ModalityConfig
1712
+ delta_indices:
1713
+ - 0
1714
+ modality_keys:
1715
+ - state.left_arm_joint_position
1716
+ - state.right_arm_joint_position
1717
+ - state.left_effector_position
1718
+ - state.right_effector_position
1719
+ - state.head_position
1720
+ - state.waist_position
1721
+ action:
1722
+ _target_: groot.vla.data.dataset.ModalityConfig
1723
+ delta_indices:
1724
+ - 0
1725
+ - 1
1726
+ - 2
1727
+ - 3
1728
+ - 4
1729
+ - 5
1730
+ - 6
1731
+ - 7
1732
+ - 8
1733
+ - 9
1734
+ - 10
1735
+ - 11
1736
+ - 12
1737
+ - 13
1738
+ - 14
1739
+ - 15
1740
+ - 16
1741
+ - 17
1742
+ - 18
1743
+ - 19
1744
+ - 20
1745
+ - 21
1746
+ - 22
1747
+ - 23
1748
+ modality_keys:
1749
+ - action.left_arm_joint_position
1750
+ - action.right_arm_joint_position
1751
+ - action.left_effector_position
1752
+ - action.right_effector_position
1753
+ - action.head_position
1754
+ - action.waist_position
1755
+ - action.robot_velocity
1756
+ language:
1757
+ _target_: groot.vla.data.dataset.ModalityConfig
1758
+ delta_indices:
1759
+ - 0
1760
+ modality_keys:
1761
+ - annotation.language.action_text
1762
+ transform_agibot:
1763
+ _target_: groot.vla.data.transform.ComposedModalityTransform
1764
+ transforms:
1765
+ - _target_: groot.vla.data.transform.VideoToTensor
1766
+ apply_to:
1767
+ - video.top_head
1768
+ - video.hand_left
1769
+ - video.hand_right
1770
+ - _target_: groot.vla.data.transform.VideoCrop
1771
+ apply_to:
1772
+ - video.top_head
1773
+ - video.hand_left
1774
+ - video.hand_right
1775
+ scale: 0.95
1776
+ mode: random
1777
+ - _target_: groot.vla.data.transform.VideoResize
1778
+ apply_to:
1779
+ - video.top_head
1780
+ - video.hand_left
1781
+ - video.hand_right
1782
+ height: 176
1783
+ width: 320
1784
+ interpolation: linear
1785
+ - _target_: groot.vla.data.transform.VideoColorJitter
1786
+ apply_to:
1787
+ - video.top_head
1788
+ - video.hand_left
1789
+ - video.hand_right
1790
+ brightness: 0.3
1791
+ contrast: 0.4
1792
+ saturation: 0.5
1793
+ hue: 0.08
1794
+ - _target_: groot.vla.data.transform.VideoToNumpy
1795
+ apply_to:
1796
+ - video.top_head
1797
+ - video.hand_left
1798
+ - video.hand_right
1799
+ - _target_: groot.vla.data.transform.StateActionToTensor
1800
+ apply_to:
1801
+ - state.left_arm_joint_position
1802
+ - state.right_arm_joint_position
1803
+ - state.left_effector_position
1804
+ - state.right_effector_position
1805
+ - state.head_position
1806
+ - state.waist_position
1807
+ - _target_: groot.vla.data.transform.StateActionTransform
1808
+ apply_to:
1809
+ - state.left_arm_joint_position
1810
+ - state.right_arm_joint_position
1811
+ - state.left_effector_position
1812
+ - state.right_effector_position
1813
+ - state.head_position
1814
+ - state.waist_position
1815
+ normalization_modes:
1816
+ state.left_arm_joint_position: q99
1817
+ state.right_arm_joint_position: q99
1818
+ state.left_effector_position: q99
1819
+ state.right_effector_position: q99
1820
+ state.head_position: q99
1821
+ state.waist_position: q99
1822
+ - _target_: groot.vla.data.transform.StateActionToTensor
1823
+ apply_to:
1824
+ - action.left_arm_joint_position
1825
+ - action.right_arm_joint_position
1826
+ - action.left_effector_position
1827
+ - action.right_effector_position
1828
+ - action.head_position
1829
+ - action.waist_position
1830
+ - action.robot_velocity
1831
+ - _target_: groot.vla.data.transform.StateActionTransform
1832
+ apply_to:
1833
+ - action.left_arm_joint_position
1834
+ - action.right_arm_joint_position
1835
+ - action.left_effector_position
1836
+ - action.right_effector_position
1837
+ - action.head_position
1838
+ - action.waist_position
1839
+ - action.robot_velocity
1840
+ normalization_modes:
1841
+ action.left_arm_joint_position: q99
1842
+ action.right_arm_joint_position: q99
1843
+ action.left_effector_position: q99
1844
+ action.right_effector_position: q99
1845
+ action.head_position: q99
1846
+ action.waist_position: q99
1847
+ action.robot_velocity: q99
1848
+ - _target_: groot.vla.data.transform.ConcatTransform
1849
+ video_concat_order:
1850
+ - video.top_head
1851
+ - video.hand_left
1852
+ - video.hand_right
1853
+ state_concat_order:
1854
+ - state.left_arm_joint_position
1855
+ - state.right_arm_joint_position
1856
+ - state.left_effector_position
1857
+ - state.right_effector_position
1858
+ - state.head_position
1859
+ - state.waist_position
1860
+ action_concat_order:
1861
+ - action.left_arm_joint_position
1862
+ - action.right_arm_joint_position
1863
+ - action.left_effector_position
1864
+ - action.right_effector_position
1865
+ - action.head_position
1866
+ - action.waist_position
1867
+ - action.robot_velocity
1868
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
1869
+ default_instruction: Perform the default behavior.
1870
+ language_dropout_prob: 0.0
1871
+ always_use_default_instruction: false
1872
+ max_state_dim: 64
1873
+ max_action_dim: 36
1874
+ max_length: 512
1875
+ state_horizon: 1
1876
+ action_horizon: 24
1877
+ embodiment_tag_mapping:
1878
+ real_gr1_arms_only: 0
1879
+ real_gr1_arms_only_annotated: 1
1880
+ real_gr1_arms_waist: 2
1881
+ real_gr1_arms_waist_annotated: 3
1882
+ dexmg_gr1_arms_only_inspire: 4
1883
+ dexmg_gr1_arms_only_fourier: 5
1884
+ dexmg_gr1_arms_waist_fourier: 6
1885
+ robocasa_single_arm: 7
1886
+ onex_eve_gripper: 8
1887
+ robocasa_gr1_arms_only_inspire_hands: 9
1888
+ robocasa_gr1_arms_only_fourier_hands: 10
1889
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
1890
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
1891
+ robocasa_panda_omron: 13
1892
+ robocasa_bimanual_panda_parallel_gripper: 15
1893
+ robocasa_bimanual_panda_inspire_hand: 16
1894
+ oxe_droid: 17
1895
+ oxe_fractal: 18
1896
+ oxe_language_table: 19
1897
+ oxe_bridge: 20
1898
+ real_panda_single_arm: 21
1899
+ hot3d_hands_only: 23
1900
+ gr1_unified: 24
1901
+ robocasa_gr1_arms_waist_fourier_hands: 25
1902
+ agibot: 26
1903
+ lapa: 27
1904
+ oxe_mutex: 28
1905
+ oxe_roboset: 29
1906
+ oxe_plex: 30
1907
+ dream: 31
1908
+ yam: 32
1909
+ xdof: 22
1910
+ gr1_unified_segmentation: 14
1911
+ language_table_sim: 7
1912
+ gr1_isaac: 0
1913
+ sim_behavior_r1_pro: 31
1914
+ mecka_hands: 27
1915
+ real_r1_pro_sharpa: 28
1916
+ real_teleop_g1: 33
1917
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
1918
+ modality_config_yam:
1919
+ video:
1920
+ _target_: groot.vla.data.dataset.ModalityConfig
1921
+ delta_indices:
1922
+ - 0
1923
+ - 1
1924
+ - 2
1925
+ - 3
1926
+ - 4
1927
+ - 5
1928
+ - 6
1929
+ - 7
1930
+ - 8
1931
+ - 9
1932
+ - 10
1933
+ - 11
1934
+ - 12
1935
+ - 13
1936
+ - 14
1937
+ - 15
1938
+ - 16
1939
+ - 17
1940
+ - 18
1941
+ - 19
1942
+ - 20
1943
+ - 21
1944
+ - 22
1945
+ - 23
1946
+ - 24
1947
+ eval_delta_indices:
1948
+ - 0
1949
+ modality_keys:
1950
+ - video.top_camera-images-rgb
1951
+ - video.left_camera-images-rgb
1952
+ - video.right_camera-images-rgb
1953
+ state:
1954
+ _target_: groot.vla.data.dataset.ModalityConfig
1955
+ delta_indices:
1956
+ - 0
1957
+ modality_keys:
1958
+ - state.left_joint_pos
1959
+ - state.left_gripper_pos
1960
+ - state.right_joint_pos
1961
+ - state.right_gripper_pos
1962
+ action:
1963
+ _target_: groot.vla.data.dataset.ModalityConfig
1964
+ delta_indices:
1965
+ - 0
1966
+ - 1
1967
+ - 2
1968
+ - 3
1969
+ - 4
1970
+ - 5
1971
+ - 6
1972
+ - 7
1973
+ - 8
1974
+ - 9
1975
+ - 10
1976
+ - 11
1977
+ - 12
1978
+ - 13
1979
+ - 14
1980
+ - 15
1981
+ - 16
1982
+ - 17
1983
+ - 18
1984
+ - 19
1985
+ - 20
1986
+ - 21
1987
+ - 22
1988
+ - 23
1989
+ modality_keys:
1990
+ - action.left_joint_pos
1991
+ - action.left_gripper_pos
1992
+ - action.right_joint_pos
1993
+ - action.right_gripper_pos
1994
+ language:
1995
+ _target_: groot.vla.data.dataset.ModalityConfig
1996
+ delta_indices:
1997
+ - 0
1998
+ modality_keys:
1999
+ - annotation.task
2000
+ transform_yam:
2001
+ _target_: groot.vla.data.transform.ComposedModalityTransform
2002
+ transforms:
2003
+ - _target_: groot.vla.data.transform.VideoToTensor
2004
+ apply_to:
2005
+ - video.top_camera-images-rgb
2006
+ - video.left_camera-images-rgb
2007
+ - video.right_camera-images-rgb
2008
+ - _target_: groot.vla.data.transform.VideoCrop
2009
+ apply_to:
2010
+ - video.top_camera-images-rgb
2011
+ - video.left_camera-images-rgb
2012
+ - video.right_camera-images-rgb
2013
+ scale: 0.95
2014
+ mode: random
2015
+ - _target_: groot.vla.data.transform.VideoResize
2016
+ apply_to:
2017
+ - video.top_camera-images-rgb
2018
+ - video.left_camera-images-rgb
2019
+ - video.right_camera-images-rgb
2020
+ height: 176
2021
+ width: 320
2022
+ interpolation: linear
2023
+ - _target_: groot.vla.data.transform.VideoColorJitter
2024
+ apply_to:
2025
+ - video.top_camera-images-rgb
2026
+ - video.left_camera-images-rgb
2027
+ - video.right_camera-images-rgb
2028
+ brightness: 0.3
2029
+ contrast: 0.4
2030
+ saturation: 0.5
2031
+ hue: 0.08
2032
+ - _target_: groot.vla.data.transform.VideoToNumpy
2033
+ apply_to:
2034
+ - video.top_camera-images-rgb
2035
+ - video.left_camera-images-rgb
2036
+ - video.right_camera-images-rgb
2037
+ - _target_: groot.vla.data.transform.StateActionToTensor
2038
+ apply_to:
2039
+ - state.left_joint_pos
2040
+ - state.left_gripper_pos
2041
+ - state.right_joint_pos
2042
+ - state.right_gripper_pos
2043
+ - _target_: groot.vla.data.transform.StateActionTransform
2044
+ apply_to:
2045
+ - state.left_joint_pos
2046
+ - state.left_gripper_pos
2047
+ - state.right_joint_pos
2048
+ - state.right_gripper_pos
2049
+ normalization_modes:
2050
+ state.left_joint_pos: q99
2051
+ state.left_gripper_pos: q99
2052
+ state.right_joint_pos: q99
2053
+ state.right_gripper_pos: q99
2054
+ - _target_: groot.vla.data.transform.StateActionToTensor
2055
+ apply_to:
2056
+ - action.left_joint_pos
2057
+ - action.left_gripper_pos
2058
+ - action.right_joint_pos
2059
+ - action.right_gripper_pos
2060
+ - _target_: groot.vla.data.transform.StateActionTransform
2061
+ apply_to:
2062
+ - action.left_joint_pos
2063
+ - action.left_gripper_pos
2064
+ - action.right_joint_pos
2065
+ - action.right_gripper_pos
2066
+ normalization_modes:
2067
+ action.left_joint_pos: q99
2068
+ action.left_gripper_pos: q99
2069
+ action.right_joint_pos: q99
2070
+ action.right_gripper_pos: q99
2071
+ - _target_: groot.vla.data.transform.ConcatTransform
2072
+ video_concat_order:
2073
+ - video.top_camera-images-rgb
2074
+ - video.left_camera-images-rgb
2075
+ - video.right_camera-images-rgb
2076
+ state_concat_order:
2077
+ - state.left_joint_pos
2078
+ - state.left_gripper_pos
2079
+ - state.right_joint_pos
2080
+ - state.right_gripper_pos
2081
+ action_concat_order:
2082
+ - action.left_joint_pos
2083
+ - action.left_gripper_pos
2084
+ - action.right_joint_pos
2085
+ - action.right_gripper_pos
2086
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
2087
+ default_instruction: Perform the default behavior.
2088
+ language_dropout_prob: 0.0
2089
+ always_use_default_instruction: false
2090
+ max_state_dim: 64
2091
+ max_action_dim: 36
2092
+ max_length: 512
2093
+ state_horizon: 1
2094
+ action_horizon: 24
2095
+ embodiment_tag_mapping:
2096
+ real_gr1_arms_only: 0
2097
+ real_gr1_arms_only_annotated: 1
2098
+ real_gr1_arms_waist: 2
2099
+ real_gr1_arms_waist_annotated: 3
2100
+ dexmg_gr1_arms_only_inspire: 4
2101
+ dexmg_gr1_arms_only_fourier: 5
2102
+ dexmg_gr1_arms_waist_fourier: 6
2103
+ robocasa_single_arm: 7
2104
+ onex_eve_gripper: 8
2105
+ robocasa_gr1_arms_only_inspire_hands: 9
2106
+ robocasa_gr1_arms_only_fourier_hands: 10
2107
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
2108
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
2109
+ robocasa_panda_omron: 13
2110
+ robocasa_bimanual_panda_parallel_gripper: 15
2111
+ robocasa_bimanual_panda_inspire_hand: 16
2112
+ oxe_droid: 17
2113
+ oxe_fractal: 18
2114
+ oxe_language_table: 19
2115
+ oxe_bridge: 20
2116
+ real_panda_single_arm: 21
2117
+ hot3d_hands_only: 23
2118
+ gr1_unified: 24
2119
+ robocasa_gr1_arms_waist_fourier_hands: 25
2120
+ agibot: 26
2121
+ lapa: 27
2122
+ oxe_mutex: 28
2123
+ oxe_roboset: 29
2124
+ oxe_plex: 30
2125
+ dream: 31
2126
+ yam: 32
2127
+ xdof: 22
2128
+ gr1_unified_segmentation: 14
2129
+ language_table_sim: 7
2130
+ gr1_isaac: 0
2131
+ sim_behavior_r1_pro: 31
2132
+ mecka_hands: 27
2133
+ real_r1_pro_sharpa: 28
2134
+ real_teleop_g1: 33
2135
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
2136
+ modality_config_real_teleop_g1:
2137
+ video:
2138
+ _target_: groot.vla.data.dataset.ModalityConfig
2139
+ delta_indices:
2140
+ - 0
2141
+ - 1
2142
+ - 2
2143
+ - 3
2144
+ - 4
2145
+ - 5
2146
+ - 6
2147
+ - 7
2148
+ - 8
2149
+ - 9
2150
+ - 10
2151
+ - 11
2152
+ - 12
2153
+ - 13
2154
+ - 14
2155
+ - 15
2156
+ - 16
2157
+ - 17
2158
+ - 18
2159
+ - 19
2160
+ - 20
2161
+ - 21
2162
+ - 22
2163
+ - 23
2164
+ - 24
2165
+ eval_delta_indices:
2166
+ - 0
2167
+ modality_keys:
2168
+ - video.egocentric
2169
+ state:
2170
+ _target_: groot.vla.data.dataset.ModalityConfig
2171
+ delta_indices:
2172
+ - 0
2173
+ modality_keys:
2174
+ - state.left_hand
2175
+ - state.right_hand
2176
+ - state.left_arm
2177
+ - state.right_arm
2178
+ - state.rpy
2179
+ - state.height
2180
+ action:
2181
+ _target_: groot.vla.data.dataset.ModalityConfig
2182
+ delta_indices:
2183
+ - 0
2184
+ - 1
2185
+ - 2
2186
+ - 3
2187
+ - 4
2188
+ - 5
2189
+ - 6
2190
+ - 7
2191
+ - 8
2192
+ - 9
2193
+ - 10
2194
+ - 11
2195
+ - 12
2196
+ - 13
2197
+ - 14
2198
+ - 15
2199
+ - 16
2200
+ - 17
2201
+ - 18
2202
+ - 19
2203
+ - 20
2204
+ - 21
2205
+ - 22
2206
+ - 23
2207
+ modality_keys:
2208
+ - action.left_hand
2209
+ - action.right_hand
2210
+ - action.left_arm
2211
+ - action.right_arm
2212
+ - action.rpy
2213
+ - action.height
2214
+ - action.torso_vx
2215
+ - action.torso_vy
2216
+ - action.torso_vyaw
2217
+ - action.torso_dyaw
2218
+ language:
2219
+ _target_: groot.vla.data.dataset.ModalityConfig
2220
+ delta_indices:
2221
+ - 0
2222
+ modality_keys:
2223
+ - annotation.language.language_instruction
2224
+ transform_real_teleop_g1:
2225
+ _target_: groot.vla.data.transform.ComposedModalityTransform
2226
+ transforms:
2227
+ - _target_: groot.vla.data.transform.VideoToTensor
2228
+ apply_to:
2229
+ - video.egocentric
2230
+ - _target_: groot.vla.data.transform.VideoCrop
2231
+ apply_to:
2232
+ - video.egocentric
2233
+ scale: 0.95
2234
+ mode: random
2235
+ - _target_: groot.vla.data.transform.VideoResize
2236
+ apply_to:
2237
+ - video.egocentric
2238
+ height: 176
2239
+ width: 320
2240
+ interpolation: linear
2241
+ - _target_: groot.vla.data.transform.VideoColorJitter
2242
+ apply_to:
2243
+ - video.egocentric
2244
+ brightness: 0.3
2245
+ contrast: 0.4
2246
+ saturation: 0.5
2247
+ hue: 0.08
2248
+ - _target_: groot.vla.data.transform.VideoToNumpy
2249
+ apply_to:
2250
+ - video.egocentric
2251
+ - _target_: groot.vla.data.transform.StateActionToTensor
2252
+ apply_to:
2253
+ - state.left_hand
2254
+ - state.right_hand
2255
+ - state.left_arm
2256
+ - state.right_arm
2257
+ - state.rpy
2258
+ - state.height
2259
+ - _target_: groot.vla.data.transform.StateActionTransform
2260
+ apply_to:
2261
+ - state.left_hand
2262
+ - state.right_hand
2263
+ - state.left_arm
2264
+ - state.right_arm
2265
+ - state.rpy
2266
+ - state.height
2267
+ normalization_modes:
2268
+ state.left_hand: q99
2269
+ state.right_hand: q99
2270
+ state.left_arm: q99
2271
+ state.right_arm: q99
2272
+ state.rpy: q99
2273
+ state.height: q99
2274
+ - _target_: groot.vla.data.transform.StateActionToTensor
2275
+ apply_to:
2276
+ - action.left_hand
2277
+ - action.right_hand
2278
+ - action.left_arm
2279
+ - action.right_arm
2280
+ - action.rpy
2281
+ - action.height
2282
+ - action.torso_vx
2283
+ - action.torso_vy
2284
+ - action.torso_vyaw
2285
+ - action.torso_dyaw
2286
+ - _target_: groot.vla.data.transform.StateActionTransform
2287
+ apply_to:
2288
+ - action.left_hand
2289
+ - action.right_hand
2290
+ - action.left_arm
2291
+ - action.right_arm
2292
+ - action.rpy
2293
+ - action.height
2294
+ - action.torso_vx
2295
+ - action.torso_vy
2296
+ - action.torso_vyaw
2297
+ - action.torso_dyaw
2298
+ normalization_modes:
2299
+ action.left_hand: q99
2300
+ action.right_hand: q99
2301
+ action.left_arm: q99
2302
+ action.right_arm: q99
2303
+ action.rpy: q99
2304
+ action.height: q99
2305
+ action.torso_vx: q99
2306
+ action.torso_vy: q99
2307
+ action.torso_vyaw: q99
2308
+ action.torso_dyaw: q99
2309
+ - _target_: groot.vla.data.transform.ConcatTransform
2310
+ video_concat_order:
2311
+ - video.egocentric
2312
+ state_concat_order:
2313
+ - state.left_hand
2314
+ - state.right_hand
2315
+ - state.left_arm
2316
+ - state.right_arm
2317
+ - state.rpy
2318
+ - state.height
2319
+ action_concat_order:
2320
+ - action.left_hand
2321
+ - action.right_hand
2322
+ - action.left_arm
2323
+ - action.right_arm
2324
+ - action.rpy
2325
+ - action.height
2326
+ - action.torso_vx
2327
+ - action.torso_vy
2328
+ - action.torso_vyaw
2329
+ - action.torso_dyaw
2330
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
2331
+ default_instruction: Perform the default behavior.
2332
+ language_dropout_prob: 0.0
2333
+ always_use_default_instruction: false
2334
+ max_state_dim: 64
2335
+ max_action_dim: 36
2336
+ max_length: 512
2337
+ state_horizon: 1
2338
+ action_horizon: 24
2339
+ embodiment_tag_mapping:
2340
+ real_gr1_arms_only: 0
2341
+ real_gr1_arms_only_annotated: 1
2342
+ real_gr1_arms_waist: 2
2343
+ real_gr1_arms_waist_annotated: 3
2344
+ dexmg_gr1_arms_only_inspire: 4
2345
+ dexmg_gr1_arms_only_fourier: 5
2346
+ dexmg_gr1_arms_waist_fourier: 6
2347
+ robocasa_single_arm: 7
2348
+ onex_eve_gripper: 8
2349
+ robocasa_gr1_arms_only_inspire_hands: 9
2350
+ robocasa_gr1_arms_only_fourier_hands: 10
2351
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
2352
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
2353
+ robocasa_panda_omron: 13
2354
+ robocasa_bimanual_panda_parallel_gripper: 15
2355
+ robocasa_bimanual_panda_inspire_hand: 16
2356
+ oxe_droid: 17
2357
+ oxe_fractal: 18
2358
+ oxe_language_table: 19
2359
+ oxe_bridge: 20
2360
+ real_panda_single_arm: 21
2361
+ hot3d_hands_only: 23
2362
+ gr1_unified: 24
2363
+ robocasa_gr1_arms_waist_fourier_hands: 25
2364
+ agibot: 26
2365
+ lapa: 27
2366
+ oxe_mutex: 28
2367
+ oxe_roboset: 29
2368
+ oxe_plex: 30
2369
+ dream: 31
2370
+ yam: 32
2371
+ xdof: 22
2372
+ gr1_unified_segmentation: 14
2373
+ language_table_sim: 7
2374
+ gr1_isaac: 0
2375
+ sim_behavior_r1_pro: 31
2376
+ mecka_hands: 27
2377
+ real_r1_pro_sharpa: 28
2378
+ real_teleop_g1: 33
2379
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
2380
+ modality_configs:
2381
+ oxe_droid:
2382
+ video:
2383
+ _target_: groot.vla.data.dataset.ModalityConfig
2384
+ delta_indices:
2385
+ - 0
2386
+ - 1
2387
+ - 2
2388
+ - 3
2389
+ - 4
2390
+ - 5
2391
+ - 6
2392
+ - 7
2393
+ - 8
2394
+ - 9
2395
+ - 10
2396
+ - 11
2397
+ - 12
2398
+ - 13
2399
+ - 14
2400
+ - 15
2401
+ - 16
2402
+ - 17
2403
+ - 18
2404
+ - 19
2405
+ - 20
2406
+ - 21
2407
+ - 22
2408
+ - 23
2409
+ - 24
2410
+ eval_delta_indices:
2411
+ - 0
2412
+ modality_keys:
2413
+ - video.exterior_image_1_left
2414
+ - video.exterior_image_2_left
2415
+ - video.wrist_image_left
2416
+ state:
2417
+ _target_: groot.vla.data.dataset.ModalityConfig
2418
+ delta_indices:
2419
+ - 0
2420
+ modality_keys:
2421
+ - state.joint_position
2422
+ - state.gripper_position
2423
+ action:
2424
+ _target_: groot.vla.data.dataset.ModalityConfig
2425
+ delta_indices:
2426
+ - 0
2427
+ - 1
2428
+ - 2
2429
+ - 3
2430
+ - 4
2431
+ - 5
2432
+ - 6
2433
+ - 7
2434
+ - 8
2435
+ - 9
2436
+ - 10
2437
+ - 11
2438
+ - 12
2439
+ - 13
2440
+ - 14
2441
+ - 15
2442
+ - 16
2443
+ - 17
2444
+ - 18
2445
+ - 19
2446
+ - 20
2447
+ - 21
2448
+ - 22
2449
+ - 23
2450
+ modality_keys:
2451
+ - action.joint_position
2452
+ - action.gripper_position
2453
+ language:
2454
+ _target_: groot.vla.data.dataset.ModalityConfig
2455
+ delta_indices:
2456
+ - 0
2457
+ modality_keys:
2458
+ - annotation.language.language_instruction
2459
+ - annotation.language.language_instruction_2
2460
+ - annotation.language.language_instruction_3
2461
+ lapa_action:
2462
+ _target_: groot.vla.data.dataset.ModalityConfig
2463
+ delta_indices:
2464
+ - 0
2465
+ modality_keys:
2466
+ - lapa_action
2467
+ agibot:
2468
+ video:
2469
+ _target_: groot.vla.data.dataset.ModalityConfig
2470
+ delta_indices:
2471
+ - 0
2472
+ - 1
2473
+ - 2
2474
+ - 3
2475
+ - 4
2476
+ - 5
2477
+ - 6
2478
+ - 7
2479
+ - 8
2480
+ - 9
2481
+ - 10
2482
+ - 11
2483
+ - 12
2484
+ - 13
2485
+ - 14
2486
+ - 15
2487
+ - 16
2488
+ - 17
2489
+ - 18
2490
+ - 19
2491
+ - 20
2492
+ - 21
2493
+ - 22
2494
+ - 23
2495
+ - 24
2496
+ eval_delta_indices:
2497
+ - -3
2498
+ - -2
2499
+ - -1
2500
+ - 0
2501
+ modality_keys:
2502
+ - video.top_head
2503
+ - video.hand_left
2504
+ - video.hand_right
2505
+ state:
2506
+ _target_: groot.vla.data.dataset.ModalityConfig
2507
+ delta_indices:
2508
+ - 0
2509
+ modality_keys:
2510
+ - state.left_arm_joint_position
2511
+ - state.right_arm_joint_position
2512
+ - state.left_effector_position
2513
+ - state.right_effector_position
2514
+ - state.head_position
2515
+ - state.waist_position
2516
+ action:
2517
+ _target_: groot.vla.data.dataset.ModalityConfig
2518
+ delta_indices:
2519
+ - 0
2520
+ - 1
2521
+ - 2
2522
+ - 3
2523
+ - 4
2524
+ - 5
2525
+ - 6
2526
+ - 7
2527
+ - 8
2528
+ - 9
2529
+ - 10
2530
+ - 11
2531
+ - 12
2532
+ - 13
2533
+ - 14
2534
+ - 15
2535
+ - 16
2536
+ - 17
2537
+ - 18
2538
+ - 19
2539
+ - 20
2540
+ - 21
2541
+ - 22
2542
+ - 23
2543
+ modality_keys:
2544
+ - action.left_arm_joint_position
2545
+ - action.right_arm_joint_position
2546
+ - action.left_effector_position
2547
+ - action.right_effector_position
2548
+ - action.head_position
2549
+ - action.waist_position
2550
+ - action.robot_velocity
2551
+ language:
2552
+ _target_: groot.vla.data.dataset.ModalityConfig
2553
+ delta_indices:
2554
+ - 0
2555
+ modality_keys:
2556
+ - annotation.language.action_text
2557
+ yam:
2558
+ video:
2559
+ _target_: groot.vla.data.dataset.ModalityConfig
2560
+ delta_indices:
2561
+ - 0
2562
+ - 1
2563
+ - 2
2564
+ - 3
2565
+ - 4
2566
+ - 5
2567
+ - 6
2568
+ - 7
2569
+ - 8
2570
+ - 9
2571
+ - 10
2572
+ - 11
2573
+ - 12
2574
+ - 13
2575
+ - 14
2576
+ - 15
2577
+ - 16
2578
+ - 17
2579
+ - 18
2580
+ - 19
2581
+ - 20
2582
+ - 21
2583
+ - 22
2584
+ - 23
2585
+ - 24
2586
+ eval_delta_indices:
2587
+ - 0
2588
+ modality_keys:
2589
+ - video.top_camera-images-rgb
2590
+ - video.left_camera-images-rgb
2591
+ - video.right_camera-images-rgb
2592
+ state:
2593
+ _target_: groot.vla.data.dataset.ModalityConfig
2594
+ delta_indices:
2595
+ - 0
2596
+ modality_keys:
2597
+ - state.left_joint_pos
2598
+ - state.left_gripper_pos
2599
+ - state.right_joint_pos
2600
+ - state.right_gripper_pos
2601
+ action:
2602
+ _target_: groot.vla.data.dataset.ModalityConfig
2603
+ delta_indices:
2604
+ - 0
2605
+ - 1
2606
+ - 2
2607
+ - 3
2608
+ - 4
2609
+ - 5
2610
+ - 6
2611
+ - 7
2612
+ - 8
2613
+ - 9
2614
+ - 10
2615
+ - 11
2616
+ - 12
2617
+ - 13
2618
+ - 14
2619
+ - 15
2620
+ - 16
2621
+ - 17
2622
+ - 18
2623
+ - 19
2624
+ - 20
2625
+ - 21
2626
+ - 22
2627
+ - 23
2628
+ modality_keys:
2629
+ - action.left_joint_pos
2630
+ - action.left_gripper_pos
2631
+ - action.right_joint_pos
2632
+ - action.right_gripper_pos
2633
+ language:
2634
+ _target_: groot.vla.data.dataset.ModalityConfig
2635
+ delta_indices:
2636
+ - 0
2637
+ modality_keys:
2638
+ - annotation.task
2639
+ real_teleop_g1:
2640
+ video:
2641
+ _target_: groot.vla.data.dataset.ModalityConfig
2642
+ delta_indices:
2643
+ - 0
2644
+ - 1
2645
+ - 2
2646
+ - 3
2647
+ - 4
2648
+ - 5
2649
+ - 6
2650
+ - 7
2651
+ - 8
2652
+ - 9
2653
+ - 10
2654
+ - 11
2655
+ - 12
2656
+ - 13
2657
+ - 14
2658
+ - 15
2659
+ - 16
2660
+ - 17
2661
+ - 18
2662
+ - 19
2663
+ - 20
2664
+ - 21
2665
+ - 22
2666
+ - 23
2667
+ - 24
2668
+ eval_delta_indices:
2669
+ - 0
2670
+ modality_keys:
2671
+ - video.egocentric
2672
+ state:
2673
+ _target_: groot.vla.data.dataset.ModalityConfig
2674
+ delta_indices:
2675
+ - 0
2676
+ modality_keys:
2677
+ - state.left_hand
2678
+ - state.right_hand
2679
+ - state.left_arm
2680
+ - state.right_arm
2681
+ - state.rpy
2682
+ - state.height
2683
+ action:
2684
+ _target_: groot.vla.data.dataset.ModalityConfig
2685
+ delta_indices:
2686
+ - 0
2687
+ - 1
2688
+ - 2
2689
+ - 3
2690
+ - 4
2691
+ - 5
2692
+ - 6
2693
+ - 7
2694
+ - 8
2695
+ - 9
2696
+ - 10
2697
+ - 11
2698
+ - 12
2699
+ - 13
2700
+ - 14
2701
+ - 15
2702
+ - 16
2703
+ - 17
2704
+ - 18
2705
+ - 19
2706
+ - 20
2707
+ - 21
2708
+ - 22
2709
+ - 23
2710
+ modality_keys:
2711
+ - action.left_hand
2712
+ - action.right_hand
2713
+ - action.left_arm
2714
+ - action.right_arm
2715
+ - action.rpy
2716
+ - action.height
2717
+ - action.torso_vx
2718
+ - action.torso_vy
2719
+ - action.torso_vyaw
2720
+ - action.torso_dyaw
2721
+ language:
2722
+ _target_: groot.vla.data.dataset.ModalityConfig
2723
+ delta_indices:
2724
+ - 0
2725
+ modality_keys:
2726
+ - annotation.language.language_instruction
2727
+ transforms:
2728
+ oxe_droid:
2729
+ _target_: groot.vla.data.transform.ComposedModalityTransform
2730
+ transforms:
2731
+ - _target_: groot.vla.data.transform.VideoToTensor
2732
+ apply_to:
2733
+ - video.exterior_image_1_left
2734
+ - video.exterior_image_2_left
2735
+ - video.wrist_image_left
2736
+ - _target_: groot.vla.data.transform.VideoCrop
2737
+ apply_to:
2738
+ - video.exterior_image_1_left
2739
+ - video.exterior_image_2_left
2740
+ - video.wrist_image_left
2741
+ scale: 0.95
2742
+ mode: random
2743
+ - _target_: groot.vla.data.transform.VideoResize
2744
+ apply_to:
2745
+ - video.exterior_image_1_left
2746
+ - video.exterior_image_2_left
2747
+ - video.wrist_image_left
2748
+ height: 176
2749
+ width: 320
2750
+ interpolation: linear
2751
+ - _target_: groot.vla.data.transform.VideoColorJitter
2752
+ apply_to:
2753
+ - video.exterior_image_1_left
2754
+ - video.exterior_image_2_left
2755
+ - video.wrist_image_left
2756
+ brightness: 0.3
2757
+ contrast: 0.4
2758
+ saturation: 0.5
2759
+ hue: 0.08
2760
+ - _target_: groot.vla.data.transform.VideoToNumpy
2761
+ apply_to:
2762
+ - video.exterior_image_1_left
2763
+ - video.exterior_image_2_left
2764
+ - video.wrist_image_left
2765
+ - _target_: groot.vla.data.transform.StateActionToTensor
2766
+ apply_to:
2767
+ - state.joint_position
2768
+ - state.gripper_position
2769
+ - _target_: groot.vla.data.transform.StateActionTransform
2770
+ apply_to:
2771
+ - state.joint_position
2772
+ - state.gripper_position
2773
+ normalization_modes:
2774
+ state.joint_position: q99
2775
+ state.gripper_position: q99
2776
+ - _target_: groot.vla.data.transform.StateActionToTensor
2777
+ apply_to:
2778
+ - action.joint_position
2779
+ - action.gripper_position
2780
+ - _target_: groot.vla.data.transform.StateActionTransform
2781
+ apply_to:
2782
+ - action.joint_position
2783
+ - action.gripper_position
2784
+ normalization_modes:
2785
+ action.joint_position: q99
2786
+ action.gripper_position: q99
2787
+ - _target_: groot.vla.data.transform.ConcatTransform
2788
+ video_concat_order:
2789
+ - video.exterior_image_1_left
2790
+ - video.exterior_image_2_left
2791
+ - video.wrist_image_left
2792
+ state_concat_order:
2793
+ - state.joint_position
2794
+ - state.gripper_position
2795
+ action_concat_order:
2796
+ - action.joint_position
2797
+ - action.gripper_position
2798
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
2799
+ default_instruction: Perform the default behavior.
2800
+ language_dropout_prob: 0.0
2801
+ always_use_default_instruction: false
2802
+ max_state_dim: 64
2803
+ max_action_dim: 36
2804
+ max_length: 512
2805
+ state_horizon: 1
2806
+ action_horizon: 24
2807
+ embodiment_tag_mapping:
2808
+ real_gr1_arms_only: 0
2809
+ real_gr1_arms_only_annotated: 1
2810
+ real_gr1_arms_waist: 2
2811
+ real_gr1_arms_waist_annotated: 3
2812
+ dexmg_gr1_arms_only_inspire: 4
2813
+ dexmg_gr1_arms_only_fourier: 5
2814
+ dexmg_gr1_arms_waist_fourier: 6
2815
+ robocasa_single_arm: 7
2816
+ onex_eve_gripper: 8
2817
+ robocasa_gr1_arms_only_inspire_hands: 9
2818
+ robocasa_gr1_arms_only_fourier_hands: 10
2819
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
2820
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
2821
+ robocasa_panda_omron: 13
2822
+ robocasa_bimanual_panda_parallel_gripper: 15
2823
+ robocasa_bimanual_panda_inspire_hand: 16
2824
+ oxe_droid: 17
2825
+ oxe_fractal: 18
2826
+ oxe_language_table: 19
2827
+ oxe_bridge: 20
2828
+ real_panda_single_arm: 21
2829
+ hot3d_hands_only: 23
2830
+ gr1_unified: 24
2831
+ robocasa_gr1_arms_waist_fourier_hands: 25
2832
+ agibot: 26
2833
+ lapa: 27
2834
+ oxe_mutex: 28
2835
+ oxe_roboset: 29
2836
+ oxe_plex: 30
2837
+ dream: 31
2838
+ yam: 32
2839
+ xdof: 22
2840
+ gr1_unified_segmentation: 14
2841
+ language_table_sim: 7
2842
+ gr1_isaac: 0
2843
+ sim_behavior_r1_pro: 31
2844
+ mecka_hands: 27
2845
+ real_r1_pro_sharpa: 28
2846
+ real_teleop_g1: 33
2847
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
2848
+ agibot:
2849
+ _target_: groot.vla.data.transform.ComposedModalityTransform
2850
+ transforms:
2851
+ - _target_: groot.vla.data.transform.VideoToTensor
2852
+ apply_to:
2853
+ - video.top_head
2854
+ - video.hand_left
2855
+ - video.hand_right
2856
+ - _target_: groot.vla.data.transform.VideoCrop
2857
+ apply_to:
2858
+ - video.top_head
2859
+ - video.hand_left
2860
+ - video.hand_right
2861
+ scale: 0.95
2862
+ mode: random
2863
+ - _target_: groot.vla.data.transform.VideoResize
2864
+ apply_to:
2865
+ - video.top_head
2866
+ - video.hand_left
2867
+ - video.hand_right
2868
+ height: 176
2869
+ width: 320
2870
+ interpolation: linear
2871
+ - _target_: groot.vla.data.transform.VideoColorJitter
2872
+ apply_to:
2873
+ - video.top_head
2874
+ - video.hand_left
2875
+ - video.hand_right
2876
+ brightness: 0.3
2877
+ contrast: 0.4
2878
+ saturation: 0.5
2879
+ hue: 0.08
2880
+ - _target_: groot.vla.data.transform.VideoToNumpy
2881
+ apply_to:
2882
+ - video.top_head
2883
+ - video.hand_left
2884
+ - video.hand_right
2885
+ - _target_: groot.vla.data.transform.StateActionToTensor
2886
+ apply_to:
2887
+ - state.left_arm_joint_position
2888
+ - state.right_arm_joint_position
2889
+ - state.left_effector_position
2890
+ - state.right_effector_position
2891
+ - state.head_position
2892
+ - state.waist_position
2893
+ - _target_: groot.vla.data.transform.StateActionTransform
2894
+ apply_to:
2895
+ - state.left_arm_joint_position
2896
+ - state.right_arm_joint_position
2897
+ - state.left_effector_position
2898
+ - state.right_effector_position
2899
+ - state.head_position
2900
+ - state.waist_position
2901
+ normalization_modes:
2902
+ state.left_arm_joint_position: q99
2903
+ state.right_arm_joint_position: q99
2904
+ state.left_effector_position: q99
2905
+ state.right_effector_position: q99
2906
+ state.head_position: q99
2907
+ state.waist_position: q99
2908
+ - _target_: groot.vla.data.transform.StateActionToTensor
2909
+ apply_to:
2910
+ - action.left_arm_joint_position
2911
+ - action.right_arm_joint_position
2912
+ - action.left_effector_position
2913
+ - action.right_effector_position
2914
+ - action.head_position
2915
+ - action.waist_position
2916
+ - action.robot_velocity
2917
+ - _target_: groot.vla.data.transform.StateActionTransform
2918
+ apply_to:
2919
+ - action.left_arm_joint_position
2920
+ - action.right_arm_joint_position
2921
+ - action.left_effector_position
2922
+ - action.right_effector_position
2923
+ - action.head_position
2924
+ - action.waist_position
2925
+ - action.robot_velocity
2926
+ normalization_modes:
2927
+ action.left_arm_joint_position: q99
2928
+ action.right_arm_joint_position: q99
2929
+ action.left_effector_position: q99
2930
+ action.right_effector_position: q99
2931
+ action.head_position: q99
2932
+ action.waist_position: q99
2933
+ action.robot_velocity: q99
2934
+ - _target_: groot.vla.data.transform.ConcatTransform
2935
+ video_concat_order:
2936
+ - video.top_head
2937
+ - video.hand_left
2938
+ - video.hand_right
2939
+ state_concat_order:
2940
+ - state.left_arm_joint_position
2941
+ - state.right_arm_joint_position
2942
+ - state.left_effector_position
2943
+ - state.right_effector_position
2944
+ - state.head_position
2945
+ - state.waist_position
2946
+ action_concat_order:
2947
+ - action.left_arm_joint_position
2948
+ - action.right_arm_joint_position
2949
+ - action.left_effector_position
2950
+ - action.right_effector_position
2951
+ - action.head_position
2952
+ - action.waist_position
2953
+ - action.robot_velocity
2954
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
2955
+ default_instruction: Perform the default behavior.
2956
+ language_dropout_prob: 0.0
2957
+ always_use_default_instruction: false
2958
+ max_state_dim: 64
2959
+ max_action_dim: 36
2960
+ max_length: 512
2961
+ state_horizon: 1
2962
+ action_horizon: 24
2963
+ embodiment_tag_mapping:
2964
+ real_gr1_arms_only: 0
2965
+ real_gr1_arms_only_annotated: 1
2966
+ real_gr1_arms_waist: 2
2967
+ real_gr1_arms_waist_annotated: 3
2968
+ dexmg_gr1_arms_only_inspire: 4
2969
+ dexmg_gr1_arms_only_fourier: 5
2970
+ dexmg_gr1_arms_waist_fourier: 6
2971
+ robocasa_single_arm: 7
2972
+ onex_eve_gripper: 8
2973
+ robocasa_gr1_arms_only_inspire_hands: 9
2974
+ robocasa_gr1_arms_only_fourier_hands: 10
2975
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
2976
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
2977
+ robocasa_panda_omron: 13
2978
+ robocasa_bimanual_panda_parallel_gripper: 15
2979
+ robocasa_bimanual_panda_inspire_hand: 16
2980
+ oxe_droid: 17
2981
+ oxe_fractal: 18
2982
+ oxe_language_table: 19
2983
+ oxe_bridge: 20
2984
+ real_panda_single_arm: 21
2985
+ hot3d_hands_only: 23
2986
+ gr1_unified: 24
2987
+ robocasa_gr1_arms_waist_fourier_hands: 25
2988
+ agibot: 26
2989
+ lapa: 27
2990
+ oxe_mutex: 28
2991
+ oxe_roboset: 29
2992
+ oxe_plex: 30
2993
+ dream: 31
2994
+ yam: 32
2995
+ xdof: 22
2996
+ gr1_unified_segmentation: 14
2997
+ language_table_sim: 7
2998
+ gr1_isaac: 0
2999
+ sim_behavior_r1_pro: 31
3000
+ mecka_hands: 27
3001
+ real_r1_pro_sharpa: 28
3002
+ real_teleop_g1: 33
3003
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
3004
+ yam:
3005
+ _target_: groot.vla.data.transform.ComposedModalityTransform
3006
+ transforms:
3007
+ - _target_: groot.vla.data.transform.VideoToTensor
3008
+ apply_to:
3009
+ - video.top_camera-images-rgb
3010
+ - video.left_camera-images-rgb
3011
+ - video.right_camera-images-rgb
3012
+ - _target_: groot.vla.data.transform.VideoCrop
3013
+ apply_to:
3014
+ - video.top_camera-images-rgb
3015
+ - video.left_camera-images-rgb
3016
+ - video.right_camera-images-rgb
3017
+ scale: 0.95
3018
+ mode: random
3019
+ - _target_: groot.vla.data.transform.VideoResize
3020
+ apply_to:
3021
+ - video.top_camera-images-rgb
3022
+ - video.left_camera-images-rgb
3023
+ - video.right_camera-images-rgb
3024
+ height: 176
3025
+ width: 320
3026
+ interpolation: linear
3027
+ - _target_: groot.vla.data.transform.VideoColorJitter
3028
+ apply_to:
3029
+ - video.top_camera-images-rgb
3030
+ - video.left_camera-images-rgb
3031
+ - video.right_camera-images-rgb
3032
+ brightness: 0.3
3033
+ contrast: 0.4
3034
+ saturation: 0.5
3035
+ hue: 0.08
3036
+ - _target_: groot.vla.data.transform.VideoToNumpy
3037
+ apply_to:
3038
+ - video.top_camera-images-rgb
3039
+ - video.left_camera-images-rgb
3040
+ - video.right_camera-images-rgb
3041
+ - _target_: groot.vla.data.transform.StateActionToTensor
3042
+ apply_to:
3043
+ - state.left_joint_pos
3044
+ - state.left_gripper_pos
3045
+ - state.right_joint_pos
3046
+ - state.right_gripper_pos
3047
+ - _target_: groot.vla.data.transform.StateActionTransform
3048
+ apply_to:
3049
+ - state.left_joint_pos
3050
+ - state.left_gripper_pos
3051
+ - state.right_joint_pos
3052
+ - state.right_gripper_pos
3053
+ normalization_modes:
3054
+ state.left_joint_pos: q99
3055
+ state.left_gripper_pos: q99
3056
+ state.right_joint_pos: q99
3057
+ state.right_gripper_pos: q99
3058
+ - _target_: groot.vla.data.transform.StateActionToTensor
3059
+ apply_to:
3060
+ - action.left_joint_pos
3061
+ - action.left_gripper_pos
3062
+ - action.right_joint_pos
3063
+ - action.right_gripper_pos
3064
+ - _target_: groot.vla.data.transform.StateActionTransform
3065
+ apply_to:
3066
+ - action.left_joint_pos
3067
+ - action.left_gripper_pos
3068
+ - action.right_joint_pos
3069
+ - action.right_gripper_pos
3070
+ normalization_modes:
3071
+ action.left_joint_pos: q99
3072
+ action.left_gripper_pos: q99
3073
+ action.right_joint_pos: q99
3074
+ action.right_gripper_pos: q99
3075
+ - _target_: groot.vla.data.transform.ConcatTransform
3076
+ video_concat_order:
3077
+ - video.top_camera-images-rgb
3078
+ - video.left_camera-images-rgb
3079
+ - video.right_camera-images-rgb
3080
+ state_concat_order:
3081
+ - state.left_joint_pos
3082
+ - state.left_gripper_pos
3083
+ - state.right_joint_pos
3084
+ - state.right_gripper_pos
3085
+ action_concat_order:
3086
+ - action.left_joint_pos
3087
+ - action.left_gripper_pos
3088
+ - action.right_joint_pos
3089
+ - action.right_gripper_pos
3090
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
3091
+ default_instruction: Perform the default behavior.
3092
+ language_dropout_prob: 0.0
3093
+ always_use_default_instruction: false
3094
+ max_state_dim: 64
3095
+ max_action_dim: 36
3096
+ max_length: 512
3097
+ state_horizon: 1
3098
+ action_horizon: 24
3099
+ embodiment_tag_mapping:
3100
+ real_gr1_arms_only: 0
3101
+ real_gr1_arms_only_annotated: 1
3102
+ real_gr1_arms_waist: 2
3103
+ real_gr1_arms_waist_annotated: 3
3104
+ dexmg_gr1_arms_only_inspire: 4
3105
+ dexmg_gr1_arms_only_fourier: 5
3106
+ dexmg_gr1_arms_waist_fourier: 6
3107
+ robocasa_single_arm: 7
3108
+ onex_eve_gripper: 8
3109
+ robocasa_gr1_arms_only_inspire_hands: 9
3110
+ robocasa_gr1_arms_only_fourier_hands: 10
3111
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
3112
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
3113
+ robocasa_panda_omron: 13
3114
+ robocasa_bimanual_panda_parallel_gripper: 15
3115
+ robocasa_bimanual_panda_inspire_hand: 16
3116
+ oxe_droid: 17
3117
+ oxe_fractal: 18
3118
+ oxe_language_table: 19
3119
+ oxe_bridge: 20
3120
+ real_panda_single_arm: 21
3121
+ hot3d_hands_only: 23
3122
+ gr1_unified: 24
3123
+ robocasa_gr1_arms_waist_fourier_hands: 25
3124
+ agibot: 26
3125
+ lapa: 27
3126
+ oxe_mutex: 28
3127
+ oxe_roboset: 29
3128
+ oxe_plex: 30
3129
+ dream: 31
3130
+ yam: 32
3131
+ xdof: 22
3132
+ gr1_unified_segmentation: 14
3133
+ language_table_sim: 7
3134
+ gr1_isaac: 0
3135
+ sim_behavior_r1_pro: 31
3136
+ mecka_hands: 27
3137
+ real_r1_pro_sharpa: 28
3138
+ real_teleop_g1: 33
3139
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
3140
+ real_teleop_g1:
3141
+ _target_: groot.vla.data.transform.ComposedModalityTransform
3142
+ transforms:
3143
+ - _target_: groot.vla.data.transform.VideoToTensor
3144
+ apply_to:
3145
+ - video.egocentric
3146
+ - _target_: groot.vla.data.transform.VideoCrop
3147
+ apply_to:
3148
+ - video.egocentric
3149
+ scale: 0.95
3150
+ mode: random
3151
+ - _target_: groot.vla.data.transform.VideoResize
3152
+ apply_to:
3153
+ - video.egocentric
3154
+ height: 176
3155
+ width: 320
3156
+ interpolation: linear
3157
+ - _target_: groot.vla.data.transform.VideoColorJitter
3158
+ apply_to:
3159
+ - video.egocentric
3160
+ brightness: 0.3
3161
+ contrast: 0.4
3162
+ saturation: 0.5
3163
+ hue: 0.08
3164
+ - _target_: groot.vla.data.transform.VideoToNumpy
3165
+ apply_to:
3166
+ - video.egocentric
3167
+ - _target_: groot.vla.data.transform.StateActionToTensor
3168
+ apply_to:
3169
+ - state.left_hand
3170
+ - state.right_hand
3171
+ - state.left_arm
3172
+ - state.right_arm
3173
+ - state.rpy
3174
+ - state.height
3175
+ - _target_: groot.vla.data.transform.StateActionTransform
3176
+ apply_to:
3177
+ - state.left_hand
3178
+ - state.right_hand
3179
+ - state.left_arm
3180
+ - state.right_arm
3181
+ - state.rpy
3182
+ - state.height
3183
+ normalization_modes:
3184
+ state.left_hand: q99
3185
+ state.right_hand: q99
3186
+ state.left_arm: q99
3187
+ state.right_arm: q99
3188
+ state.rpy: q99
3189
+ state.height: q99
3190
+ - _target_: groot.vla.data.transform.StateActionToTensor
3191
+ apply_to:
3192
+ - action.left_hand
3193
+ - action.right_hand
3194
+ - action.left_arm
3195
+ - action.right_arm
3196
+ - action.rpy
3197
+ - action.height
3198
+ - action.torso_vx
3199
+ - action.torso_vy
3200
+ - action.torso_vyaw
3201
+ - action.torso_dyaw
3202
+ - _target_: groot.vla.data.transform.StateActionTransform
3203
+ apply_to:
3204
+ - action.left_hand
3205
+ - action.right_hand
3206
+ - action.left_arm
3207
+ - action.right_arm
3208
+ - action.rpy
3209
+ - action.height
3210
+ - action.torso_vx
3211
+ - action.torso_vy
3212
+ - action.torso_vyaw
3213
+ - action.torso_dyaw
3214
+ normalization_modes:
3215
+ action.left_hand: q99
3216
+ action.right_hand: q99
3217
+ action.left_arm: q99
3218
+ action.right_arm: q99
3219
+ action.rpy: q99
3220
+ action.height: q99
3221
+ action.torso_vx: q99
3222
+ action.torso_vy: q99
3223
+ action.torso_vyaw: q99
3224
+ action.torso_dyaw: q99
3225
+ - _target_: groot.vla.data.transform.ConcatTransform
3226
+ video_concat_order:
3227
+ - video.egocentric
3228
+ state_concat_order:
3229
+ - state.left_hand
3230
+ - state.right_hand
3231
+ - state.left_arm
3232
+ - state.right_arm
3233
+ - state.rpy
3234
+ - state.height
3235
+ action_concat_order:
3236
+ - action.left_hand
3237
+ - action.right_hand
3238
+ - action.left_arm
3239
+ - action.right_arm
3240
+ - action.rpy
3241
+ - action.height
3242
+ - action.torso_vx
3243
+ - action.torso_vy
3244
+ - action.torso_vyaw
3245
+ - action.torso_dyaw
3246
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
3247
+ default_instruction: Perform the default behavior.
3248
+ language_dropout_prob: 0.0
3249
+ always_use_default_instruction: false
3250
+ max_state_dim: 64
3251
+ max_action_dim: 36
3252
+ max_length: 512
3253
+ state_horizon: 1
3254
+ action_horizon: 24
3255
+ embodiment_tag_mapping:
3256
+ real_gr1_arms_only: 0
3257
+ real_gr1_arms_only_annotated: 1
3258
+ real_gr1_arms_waist: 2
3259
+ real_gr1_arms_waist_annotated: 3
3260
+ dexmg_gr1_arms_only_inspire: 4
3261
+ dexmg_gr1_arms_only_fourier: 5
3262
+ dexmg_gr1_arms_waist_fourier: 6
3263
+ robocasa_single_arm: 7
3264
+ onex_eve_gripper: 8
3265
+ robocasa_gr1_arms_only_inspire_hands: 9
3266
+ robocasa_gr1_arms_only_fourier_hands: 10
3267
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
3268
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
3269
+ robocasa_panda_omron: 13
3270
+ robocasa_bimanual_panda_parallel_gripper: 15
3271
+ robocasa_bimanual_panda_inspire_hand: 16
3272
+ oxe_droid: 17
3273
+ oxe_fractal: 18
3274
+ oxe_language_table: 19
3275
+ oxe_bridge: 20
3276
+ real_panda_single_arm: 21
3277
+ hot3d_hands_only: 23
3278
+ gr1_unified: 24
3279
+ robocasa_gr1_arms_waist_fourier_hands: 25
3280
+ agibot: 26
3281
+ lapa: 27
3282
+ oxe_mutex: 28
3283
+ oxe_roboset: 29
3284
+ oxe_plex: 30
3285
+ dream: 31
3286
+ yam: 32
3287
+ xdof: 22
3288
+ gr1_unified_segmentation: 14
3289
+ language_table_sim: 7
3290
+ gr1_isaac: 0
3291
+ sim_behavior_r1_pro: 31
3292
+ mecka_hands: 27
3293
+ real_r1_pro_sharpa: 28
3294
+ real_teleop_g1: 33
3295
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
3296
+ metadata_versions:
3297
+ oxe_droid: '0221'
3298
+ agibot: '0221'
3299
+ yam: '0221'
3300
+ real_teleop_g1: '0221'
3301
+ fps:
3302
+ yam: 30
3303
+ real_teleop_g1: 30
3304
+ relative_action: true
3305
+ relative_action_per_horizon: false
3306
+ relative_action_keys:
3307
+ - left_hand
3308
+ - right_hand
3309
+ - left_arm
3310
+ - right_arm
3311
+ - rpy
3312
+ - height
3313
+ max_chunk_size: 4
3314
+ dataset_shard_sampling_rate: 0.1
3315
+ mixture_dataset_cls: groot.vla.data.dataset.lerobot_sharded.ShardedLeRobotMixtureDataset.from_mixture_spec
3316
+ single_dataset_cls: groot.vla.data.dataset.lerobot_sharded.ShardedLeRobotSubLangSingleActionChunkDatasetDROID
3317
+ real_teleop_g1_data_root: /hfm/boqian/liboqian_data/data/real_data/gear/g1/Pick_bottle_and_turn_and_pour_into_cup
3318
+ total_training_steps: 1048576000000
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/experiment_cfg/metadata.json ADDED
@@ -0,0 +1,787 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "real_teleop_g1": {
3
+ "statistics": {
4
+ "state": {
5
+ "left_hand": {
6
+ "max": [
7
+ -0.15012885630130768,
8
+ 0.9005824327468872,
9
+ 1.240552306175232,
10
+ -0.001318894443102181,
11
+ -0.029013408347964287,
12
+ -0.0026860591024160385,
13
+ -0.032535798847675323
14
+ ],
15
+ "min": [
16
+ -1.0365108251571655,
17
+ 0.3635019063949585,
18
+ 0.6077943444252014,
19
+ -0.8983967900276184,
20
+ -1.4610702991485596,
21
+ -0.9210805296897888,
22
+ -1.2054297924041748
23
+ ],
24
+ "mean": [
25
+ -0.7106521057783574,
26
+ 0.7397562400548329,
27
+ 0.7932539759105839,
28
+ -0.18307749281402536,
29
+ -0.3996594926481087,
30
+ -0.22440346846797105,
31
+ -0.3648747590553514
32
+ ],
33
+ "std": [
34
+ 0.1800224325521231,
35
+ 0.08892492705684825,
36
+ 0.17347929747061464,
37
+ 0.1250548816433564,
38
+ 0.14322181312600368,
39
+ 0.12746796614686143,
40
+ 0.1004095686025728
41
+ ],
42
+ "q01": [
43
+ -1.0010048174858093,
44
+ 0.5051653385162354,
45
+ 0.6091835498809814,
46
+ -0.5192373794317245,
47
+ -0.7626793467998505,
48
+ -0.5990579128265381,
49
+ -0.6296854817867279
50
+ ],
51
+ "q99": [
52
+ -0.35215023159980774,
53
+ 0.8828813862800599,
54
+ 1.1794202327728271,
55
+ -0.012645758632570512,
56
+ -0.08962718859314919,
57
+ -0.01236837636679411,
58
+ -0.14306417107582092
59
+ ]
60
+ },
61
+ "right_hand": {
62
+ "max": [
63
+ 0.013422729447484016,
64
+ -0.0029037927743047476,
65
+ -0.4307878315448761,
66
+ 1.1691919565200806,
67
+ 1.5508151054382324,
68
+ 1.008196473121643,
69
+ 1.4388744831085205
70
+ ],
71
+ "min": [
72
+ -0.9901090860366821,
73
+ -0.8877645134925842,
74
+ -1.2071785926818848,
75
+ -0.03135598078370094,
76
+ 0.03392980992794037,
77
+ 0.023357456550002098,
78
+ 0.019666638225317
79
+ ],
80
+ "mean": [
81
+ -0.4129581264587618,
82
+ -0.4582520757612821,
83
+ -0.664123836057758,
84
+ 0.4952124991587111,
85
+ 0.9231042625678995,
86
+ 0.5022273107153479,
87
+ 0.669409344487608
88
+ ],
89
+ "std": [
90
+ 0.19006874361243062,
91
+ 0.11976727609652792,
92
+ 0.10119243113959121,
93
+ 0.2266161525462106,
94
+ 0.4655465641238765,
95
+ 0.21432269096327275,
96
+ 0.31467715432459836
97
+ ],
98
+ "q01": [
99
+ -0.9081442606449127,
100
+ -0.8110105991363525,
101
+ -0.9831316405534745,
102
+ 0.023696813061833383,
103
+ 0.037188550531864165,
104
+ 0.024981562048196793,
105
+ 0.02064503453671932
106
+ ],
107
+ "q99": [
108
+ -0.10628944709897117,
109
+ -0.18123969539999962,
110
+ -0.5385138392448425,
111
+ 0.9067972785234452,
112
+ 1.4886025190353394,
113
+ 0.871771514415741,
114
+ 1.3075256443023682
115
+ ]
116
+ },
117
+ "left_arm": {
118
+ "max": [
119
+ 0.6784030199050903,
120
+ 0.5408600568771362,
121
+ 0.261579692363739,
122
+ 1.2853081226348877,
123
+ 0.6677011251449585,
124
+ 0.01986505277454853,
125
+ 0.5779042840003967
126
+ ],
127
+ "min": [
128
+ -0.6697743535041809,
129
+ -0.04027898237109184,
130
+ -0.6574546098709106,
131
+ -0.02732403390109539,
132
+ -0.18146513402462006,
133
+ -1.2264176607131958,
134
+ -0.8434670567512512
135
+ ],
136
+ "mean": [
137
+ 0.06576749867032948,
138
+ 0.25579782742676094,
139
+ -0.06004482964572005,
140
+ 0.808768983211113,
141
+ 0.10850686643277818,
142
+ -0.7254681426808276,
143
+ -0.2878231007376268
144
+ ],
145
+ "std": [
146
+ 0.16998997778227107,
147
+ 0.09886796724754655,
148
+ 0.1532526690961023,
149
+ 0.3788245539398352,
150
+ 0.1304561854786525,
151
+ 0.3305712410884026,
152
+ 0.20458272924414123
153
+ ],
154
+ "q01": [
155
+ -0.472993403673172,
156
+ 0.01923264417797327,
157
+ -0.4175487527251244,
158
+ 0.11523827515542508,
159
+ -0.13969017088413238,
160
+ -1.190878005027771,
161
+ -0.7066819667816162
162
+ ],
163
+ "q99": [
164
+ 0.4539884480834007,
165
+ 0.5193723440170288,
166
+ 0.2179892195761203,
167
+ 1.2710349559783936,
168
+ 0.6194926702976227,
169
+ -0.09952061586081994,
170
+ 0.17473116487264606
171
+ ]
172
+ },
173
+ "right_arm": {
174
+ "max": [
175
+ 0.9002070426940918,
176
+ 0.11829628795385361,
177
+ 0.8515990376472473,
178
+ 1.2844573259353638,
179
+ 0.8022719621658325,
180
+ 0.6399767994880676,
181
+ 1.2358516454696655
182
+ ],
183
+ "min": [
184
+ -1.2026649713516235,
185
+ -0.6969066858291626,
186
+ -0.815610408782959,
187
+ -0.9874761700630188,
188
+ -1.6946533918380737,
189
+ -1.4213329553604126,
190
+ -1.342279314994812
191
+ ],
192
+ "mean": [
193
+ -0.0662775668920557,
194
+ -0.1345353524434497,
195
+ 0.0634982026036378,
196
+ -0.10989526924031702,
197
+ -0.05263142642975812,
198
+ -0.12096835662218453,
199
+ 0.120861711803444
200
+ ],
201
+ "std": [
202
+ 0.4008494462356859,
203
+ 0.1700786431413106,
204
+ 0.21295159275658426,
205
+ 0.43649950708158436,
206
+ 0.4497950790039162,
207
+ 0.36514804111543175,
208
+ 0.1967684071659634
209
+ ],
210
+ "q01": [
211
+ -0.950262793302536,
212
+ -0.5722128254175186,
213
+ -0.4087426000833511,
214
+ -0.8998615843057632,
215
+ -1.200806084871292,
216
+ -1.1520812797546387,
217
+ -0.3325912955403328
218
+ ],
219
+ "q99": [
220
+ 0.7100517773628232,
221
+ 0.10524750150740145,
222
+ 0.6194546324014664,
223
+ 0.9450081789493561,
224
+ 0.559214276075363,
225
+ 0.5243198603391647,
226
+ 0.6973656332492828
227
+ ]
228
+ },
229
+ "rpy": {
230
+ "max": [
231
+ 0.1827869415283203,
232
+ 0.12582087516784668,
233
+ 0.7600905299186707
234
+ ],
235
+ "min": [
236
+ -0.25714462995529175,
237
+ -0.27553248405456543,
238
+ -0.3924732208251953
239
+ ],
240
+ "mean": [
241
+ -0.07450471914543803,
242
+ -0.016358907944713997,
243
+ 0.08461220301037418
244
+ ],
245
+ "std": [
246
+ 0.052686434459038575,
247
+ 0.05521563296798423,
248
+ 0.1871730371455983
249
+ ],
250
+ "q01": [
251
+ -0.21652273923158646,
252
+ -0.15164137467741967,
253
+ -0.3411825066804886
254
+ ],
255
+ "q99": [
256
+ 0.023736179061233972,
257
+ 0.0924126328527927,
258
+ 0.6843682318925858
259
+ ]
260
+ },
261
+ "height": {
262
+ "max": [
263
+ 0.75
264
+ ],
265
+ "min": [
266
+ 0.6516909599304199
267
+ ],
268
+ "mean": [
269
+ 0.717019414027915
270
+ ],
271
+ "std": [
272
+ 0.010611909948945071
273
+ ],
274
+ "q01": [
275
+ 0.681534378528595
276
+ ],
277
+ "q99": [
278
+ 0.7377596861124038
279
+ ]
280
+ }
281
+ },
282
+ "action": {
283
+ "left_hand": {
284
+ "max": [
285
+ 0.5982731580734253,
286
+ 0.5278768539428711,
287
+ 0.5354012846946716,
288
+ 0.4014817903516814,
289
+ 0.4918059706687927,
290
+ 0.44213195890188217,
291
+ 0.4230584502220154
292
+ ],
293
+ "min": [
294
+ -0.6301521956920624,
295
+ -0.5149323344230652,
296
+ -0.529521644115448,
297
+ -0.6856191083788872,
298
+ -1.158537894487381,
299
+ -0.7385352402925491,
300
+ -0.905200719833374
301
+ ],
302
+ "mean": [
303
+ -0.017651338260789542,
304
+ 0.00866811522079485,
305
+ -0.004330770698072518,
306
+ -0.002544540625456693,
307
+ -0.0005896838413398195,
308
+ 0.002361588407552847,
309
+ 0.004366535660068055
310
+ ],
311
+ "std": [
312
+ 0.06554604732418758,
313
+ 0.07154837844491334,
314
+ 0.12856598959026014,
315
+ 0.03597542702397461,
316
+ 0.04581648449077367,
317
+ 0.03962922545114253,
318
+ 0.04346845484086631
319
+ ],
320
+ "q01": [
321
+ -0.24043308198451993,
322
+ -0.19401812851428984,
323
+ -0.39614349603652954,
324
+ -0.10097884107381103,
325
+ -0.14891792684793473,
326
+ -0.11517959907650946,
327
+ -0.1279512584209442
328
+ ],
329
+ "q99": [
330
+ 0.23256101012229896,
331
+ 0.20524268746376023,
332
+ 0.3933871388435364,
333
+ 0.09811923948582232,
334
+ 0.12937735170125958,
335
+ 0.1224899165794337,
336
+ 0.11734045296907396
337
+ ]
338
+ },
339
+ "right_hand": {
340
+ "max": [
341
+ 0.8790446370840073,
342
+ 0.642131395637989,
343
+ 0.5467694997787476,
344
+ 1.2996619567275047,
345
+ 1.698824219405651,
346
+ 1.07933783903718,
347
+ 1.2983299642801285
348
+ ],
349
+ "min": [
350
+ -0.8118728846311569,
351
+ -0.6551126688718796,
352
+ -0.8104255795478821,
353
+ -0.940872801351361,
354
+ -1.4264521131990477,
355
+ -0.9354669973254204,
356
+ -0.9567366391420364
357
+ ],
358
+ "mean": [
359
+ 0.06989124645612069,
360
+ -0.18283122662561244,
361
+ -0.07939327808296306,
362
+ 0.13485396014136192,
363
+ 0.1019227924721336,
364
+ 0.12847191988697357,
365
+ 0.06420547800869135
366
+ ],
367
+ "std": [
368
+ 0.14218628429822006,
369
+ 0.14446828671114784,
370
+ 0.1265010717040926,
371
+ 0.1855492922965264,
372
+ 0.2240967671588938,
373
+ 0.17045304446399806,
374
+ 0.1757324556093506
375
+ ],
376
+ "q01": [
377
+ -0.35978180170059204,
378
+ -0.4319852590560913,
379
+ -0.36317331790924073,
380
+ -0.523832806199789,
381
+ -0.6789782047271729,
382
+ -0.4287099301815033,
383
+ -0.4532649278640747
384
+ ],
385
+ "q99": [
386
+ 0.4718186043202869,
387
+ 0.19760750830173343,
388
+ 0.25330804288387265,
389
+ 0.6829912588000264,
390
+ 0.8806305289268437,
391
+ 0.6021148726344102,
392
+ 0.6443318486213678
393
+ ]
394
+ },
395
+ "left_arm": {
396
+ "max": [
397
+ 0.48457716405391693,
398
+ 0.3535273000597954,
399
+ 0.21352535486221313,
400
+ 0.7616467773914337,
401
+ 0.6007281094789505,
402
+ 0.41783052682876587,
403
+ 0.5165230333805084
404
+ ],
405
+ "min": [
406
+ -0.6373330056667328,
407
+ -0.15849913656711578,
408
+ -0.5613232888281345,
409
+ -0.701464831829071,
410
+ -0.2516954243183136,
411
+ -0.8871668800711632,
412
+ -0.4376880154013634
413
+ ],
414
+ "mean": [
415
+ -0.011776995941582026,
416
+ 0.00991253342440134,
417
+ -0.006852865379075551,
418
+ 0.010552169557516157,
419
+ 0.001214240309897302,
420
+ -0.014453163590611127,
421
+ -0.010611657202882456
422
+ ],
423
+ "std": [
424
+ 0.03602045474094356,
425
+ 0.024036875417496602,
426
+ 0.02602605089260403,
427
+ 0.05981475365626236,
428
+ 0.028429534988341422,
429
+ 0.08916377427180806,
430
+ 0.03528055113076741
431
+ ],
432
+ "q01": [
433
+ -0.1545458011329174,
434
+ -0.031217777729034422,
435
+ -0.11606558486819267,
436
+ -0.0627044416964054,
437
+ -0.056227393448352814,
438
+ -0.5197785004973411,
439
+ -0.156750052748248
440
+ ],
441
+ "q99": [
442
+ 0.05294616930186727,
443
+ 0.11603573272004716,
444
+ 0.02942010760307312,
445
+ 0.32693205326795444,
446
+ 0.11353727197274502,
447
+ 0.06767475279048074,
448
+ 0.06325854659080493
449
+ ]
450
+ },
451
+ "right_arm": {
452
+ "max": [
453
+ 1.0019388496875763,
454
+ 0.6778036952018738,
455
+ 0.8202316761016846,
456
+ 1.2868973910808563,
457
+ 1.6054365634918213,
458
+ 0.9513278678059578,
459
+ 1.0065587792778388
460
+ ],
461
+ "min": [
462
+ -0.8566206395626068,
463
+ -0.5352737102657557,
464
+ -0.9050570726394653,
465
+ -1.1766326129436493,
466
+ -1.1665689051151276,
467
+ -1.226991206407547,
468
+ -1.4597684778273106
469
+ ],
470
+ "mean": [
471
+ 0.0011831602733654706,
472
+ 0.004668234701338991,
473
+ -0.00464666805688978,
474
+ -0.008819726561502726,
475
+ 0.01680292255365519,
476
+ -0.023272413508623503,
477
+ 0.001828295062135536
478
+ ],
479
+ "std": [
480
+ 0.15114015459104163,
481
+ 0.10717935148199742,
482
+ 0.12210241645133885,
483
+ 0.18724624428113243,
484
+ 0.21873655609526446,
485
+ 0.18543346050682308,
486
+ 0.1150104843557695
487
+ ],
488
+ "q01": [
489
+ -0.39952214881777764,
490
+ -0.25475079081952573,
491
+ -0.4339929953217506,
492
+ -0.645755667425692,
493
+ -0.5597020275890827,
494
+ -0.640592061728239,
495
+ -0.3658384716138244
496
+ ],
497
+ "q99": [
498
+ 0.47152970274910266,
499
+ 0.3625141793861984,
500
+ 0.3024886786937714,
501
+ 0.5137429375201458,
502
+ 0.7895279735326737,
503
+ 0.5061310570687053,
504
+ 0.30551581159233987
505
+ ]
506
+ },
507
+ "rpy": {
508
+ "max": [
509
+ 0.2602043002843857,
510
+ 0.0978180319070816,
511
+ 0.4442971870303154
512
+ ],
513
+ "min": [
514
+ -0.2339760884642601,
515
+ -0.27553248405456543,
516
+ -0.44536447897553444
517
+ ],
518
+ "mean": [
519
+ -0.0005869882495184713,
520
+ 0.0015875524968715301,
521
+ 0.0008563187639556434
522
+ ],
523
+ "std": [
524
+ 0.015963769360634213,
525
+ 0.009780051628911945,
526
+ 0.024366577111250988
527
+ ],
528
+ "q01": [
529
+ -0.049293445050716395,
530
+ -0.023501918860711155,
531
+ -0.0595750443637371
532
+ ],
533
+ "q99": [
534
+ 0.05025968039408321,
535
+ 0.028102314658462975,
536
+ 0.0824947778135533
537
+ ]
538
+ },
539
+ "height": {
540
+ "max": [
541
+ 0.04315239191055298
542
+ ],
543
+ "min": [
544
+ -0.08728241920471191
545
+ ],
546
+ "mean": [
547
+ 5.191507986045299e-05
548
+ ],
549
+ "std": [
550
+ 0.004163189516331377
551
+ ],
552
+ "q01": [
553
+ -0.012046998739242552
554
+ ],
555
+ "q99": [
556
+ 0.012092745304107644
557
+ ]
558
+ },
559
+ "torso_vx": {
560
+ "max": [
561
+ 0.3499999940395355
562
+ ],
563
+ "min": [
564
+ 0.0
565
+ ],
566
+ "mean": [
567
+ 0.017269024247052768
568
+ ],
569
+ "std": [
570
+ 0.07580197349073346
571
+ ],
572
+ "q01": [
573
+ 0.0
574
+ ],
575
+ "q99": [
576
+ 0.3499999940395355
577
+ ]
578
+ },
579
+ "torso_vy": {
580
+ "max": [
581
+ 0.5
582
+ ],
583
+ "min": [
584
+ 0.0
585
+ ],
586
+ "mean": [
587
+ 0.0002320065992988245
588
+ ],
589
+ "std": [
590
+ 0.01076798368252486
591
+ ],
592
+ "q01": [
593
+ 0.0
594
+ ],
595
+ "q99": [
596
+ 0.0
597
+ ]
598
+ },
599
+ "torso_vyaw": {
600
+ "max": [
601
+ 0.5
602
+ ],
603
+ "min": [
604
+ -0.5
605
+ ],
606
+ "mean": [
607
+ -0.03733115137363985
608
+ ],
609
+ "std": [
610
+ 0.1199053455233051
611
+ ],
612
+ "q01": [
613
+ -0.5
614
+ ],
615
+ "q99": [
616
+ 0.0
617
+ ]
618
+ },
619
+ "torso_dyaw": {
620
+ "max": [
621
+ 0.3072107136249542
622
+ ],
623
+ "min": [
624
+ -0.1875186562538147
625
+ ],
626
+ "mean": [
627
+ 0.016342195046893805
628
+ ],
629
+ "std": [
630
+ 0.048962870720188865
631
+ ],
632
+ "q01": [
633
+ -0.005849878406152129
634
+ ],
635
+ "q99": [
636
+ 0.22258896410465237
637
+ ]
638
+ }
639
+ }
640
+ },
641
+ "modalities": {
642
+ "video": {
643
+ "egocentric": {
644
+ "resolution": [
645
+ 640,
646
+ 480
647
+ ],
648
+ "channels": 3,
649
+ "fps": 30.0
650
+ }
651
+ },
652
+ "state": {
653
+ "left_hand": {
654
+ "absolute": true,
655
+ "rotation_type": null,
656
+ "shape": [
657
+ 7
658
+ ],
659
+ "continuous": true
660
+ },
661
+ "right_hand": {
662
+ "absolute": true,
663
+ "rotation_type": null,
664
+ "shape": [
665
+ 7
666
+ ],
667
+ "continuous": true
668
+ },
669
+ "left_arm": {
670
+ "absolute": true,
671
+ "rotation_type": null,
672
+ "shape": [
673
+ 7
674
+ ],
675
+ "continuous": true
676
+ },
677
+ "right_arm": {
678
+ "absolute": true,
679
+ "rotation_type": null,
680
+ "shape": [
681
+ 7
682
+ ],
683
+ "continuous": true
684
+ },
685
+ "rpy": {
686
+ "absolute": true,
687
+ "rotation_type": null,
688
+ "shape": [
689
+ 3
690
+ ],
691
+ "continuous": true
692
+ },
693
+ "height": {
694
+ "absolute": true,
695
+ "rotation_type": null,
696
+ "shape": [
697
+ 1
698
+ ],
699
+ "continuous": true
700
+ }
701
+ },
702
+ "action": {
703
+ "left_hand": {
704
+ "absolute": true,
705
+ "rotation_type": null,
706
+ "shape": [
707
+ 7
708
+ ],
709
+ "continuous": true
710
+ },
711
+ "right_hand": {
712
+ "absolute": true,
713
+ "rotation_type": null,
714
+ "shape": [
715
+ 7
716
+ ],
717
+ "continuous": true
718
+ },
719
+ "left_arm": {
720
+ "absolute": true,
721
+ "rotation_type": null,
722
+ "shape": [
723
+ 7
724
+ ],
725
+ "continuous": true
726
+ },
727
+ "right_arm": {
728
+ "absolute": true,
729
+ "rotation_type": null,
730
+ "shape": [
731
+ 7
732
+ ],
733
+ "continuous": true
734
+ },
735
+ "rpy": {
736
+ "absolute": true,
737
+ "rotation_type": null,
738
+ "shape": [
739
+ 3
740
+ ],
741
+ "continuous": true
742
+ },
743
+ "height": {
744
+ "absolute": true,
745
+ "rotation_type": null,
746
+ "shape": [
747
+ 1
748
+ ],
749
+ "continuous": true
750
+ },
751
+ "torso_vx": {
752
+ "absolute": false,
753
+ "rotation_type": null,
754
+ "shape": [
755
+ 1
756
+ ],
757
+ "continuous": true
758
+ },
759
+ "torso_vy": {
760
+ "absolute": false,
761
+ "rotation_type": null,
762
+ "shape": [
763
+ 1
764
+ ],
765
+ "continuous": true
766
+ },
767
+ "torso_vyaw": {
768
+ "absolute": false,
769
+ "rotation_type": null,
770
+ "shape": [
771
+ 1
772
+ ],
773
+ "continuous": true
774
+ },
775
+ "torso_dyaw": {
776
+ "absolute": false,
777
+ "rotation_type": null,
778
+ "shape": [
779
+ 1
780
+ ],
781
+ "continuous": true
782
+ }
783
+ }
784
+ },
785
+ "embodiment_tag": "real_teleop_g1"
786
+ }
787
+ }
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e54b0d78312d085adbb1b47afb1539a753e1648a42e3961d15383011d5d41dad
3
+ size 24726494997
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e7d4159a5f78e298eea595df891039c8eb3e55150e04ecd7e6e77007cbccde8
3
+ size 24726494869
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00f4cf817a1d4526c9127d904d8e30defe5fba49dfc8d3d18e1b5fd9afcf2b3a
3
+ size 24726494933
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1064a887d1d9df9091382c220381cfe871b770e7878d74d20851111e3cfa7595
3
+ size 24726495189
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ac6d1ec241aa1e4252c528f9fdfbeaa6edec9afab9a946c1d1d7c0edd4db60a
3
+ size 24726494869
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3e8679f10db492ccc08552a1eadcc77563097a20661e5eef2ff9a923abf8ef5
3
+ size 24726494805
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00228701cf4e15e5be08166f7c24db907a4f021ac28c178a92f4141e8c87ea2c
3
+ size 24726494805
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1319970345100e63d9190905d280b43f52ae42bbc37c8210b8ddfbbdcb375c5
3
+ size 24726495317
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/global_step20000/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5694cb3d62897774b80e800e5d8381c3db0afadcbcd18fff7660848e74473f6b
3
+ size 58729254967
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step20000
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/model-00001-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0e0ff763ed8bb6b855782e10f1577d676bc7abce2df0e46cea5d340a05963a5
3
+ size 4935816720
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/model-00002-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3dd6a6ed2735b920badc8b58864ac567f921b5e1ab537f9c755e35e6c65ebd8c
3
+ size 4983107552
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/model-00003-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b24734a620ebeda5fda8ba3860115a53ba609630eebeef8be48e9c585c8a80e
3
+ size 4937616256
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/model-00004-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25fa20cfabcc3e779f0d47b2f4dfec0e1746e5a5834a74124cf5fbb4cf75769c
3
+ size 4987670360
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/model-00005-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6de3521b2cca697af9c1be60f81359e9c2f21325ab881f5cb9a9da0beaeb85f7
3
+ size 4987714608
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/model-00006-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5659d860ff59c32f444ec88eac2f0e5b05d04d931d7bd83672057d43c5786155
3
+ size 4950963456
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/model-00007-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c27719ee648bb3595b6813d99678c6ea15b8d5f07953cbd5096863ce612a274
3
+ size 4951004896
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/model-00008-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c81c30844b10d612be6a677b7ee39c168f08eeef2e488b8943bbf87b19569c87
3
+ size 4950963472
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/model-00009-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0bdd073dfa80f90311e8690244be6960bd25dbf7231d07029afa3ce6531e030
3
+ size 4950984200
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/model-00010-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2343d0348efd68e66fd5377236a217e0d8f3edd9213d37e815ab4200d26fb97
3
+ size 1212823208
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/real_world_eval_gen_20260324_0/000000_03_24_13_51_02_n1.mp4 ADDED
Binary file (16.9 kB). View file
 
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/real_world_eval_gen_20260324_0/000001_03_24_13_58_27_n34.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:397a4f513fd768f0dda4f6e09307b4858693d60f759d3b17711d7b2b4c8a7e31
3
+ size 664300
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/real_world_eval_gen_20260326_0/000000_03_26_17_26_53_n40.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d62daa6a188565dad50c8354387857de48434cea229883fd00af554245ac55a6
3
+ size 793290
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/real_world_eval_gen_20260326_0/000001_03_26_17_31_03_n40.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d62daa6a188565dad50c8354387857de48434cea229883fd00af554245ac55a6
3
+ size 793290
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/real_world_eval_gen_20260327_0/000000_03_27_02_53_54_n50.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a69d9c7826be63d6da2a38bba1529652c64dcabd1261e2d723cae4c035849d5
3
+ size 995418
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/real_world_eval_gen_20260327_0/000001_03_27_02_55_16_n40.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9da95fd035e16e89c0ec661c85aa36416f007c6c79a9f36e87b27f7109e00818
3
+ size 792418
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cae4b6de2acf6264a5c481dcbd7aa866d94dca1326cdbccff19f3fef8f7c53f
3
+ size 16389
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fea41920daf9f3742ab29b10d916136a3d60dd6eb2a1e03f018d35e54ef2e868
3
+ size 16389
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:099243b837d52dab61610fca2e997abd8a3c6e4ede0d8ccc83eee5e47383ded5
3
+ size 16389
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be34cfa1b5e67c3e539fb03abbe775743b14e7fa4a17684d9400f13e8d16b341
3
+ size 16389
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fec0da59f18a4474684abde6d1267e41f007667a870ca3c08f7467fd1381a20
3
+ size 16389
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d71402cde94b72b47be2a664a4c005d140739f9eec545f070ca9677825a39dc2
3
+ size 16389
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:841612644c79a5764b67f40bd1a9be5ee750c520b8b2a207b8bfd46f9cb4b5cf
3
+ size 16389
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f970d4dde7f6abbd2c2a190732b19c85e7aafed87115303a61018145f87ecd7a
3
+ size 16389
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28508b0a940eb7f712597070ad628ccf8634a0efd412740f51a02d7843f5d668
3
+ size 1465
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/wandb_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"project": "dreamzero", "run_id": ""}
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/checkpoint-20000/zero_to_fp32.py ADDED
@@ -0,0 +1,760 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts consolidated fp32 weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example:
14
+ # python zero_to_fp32.py . output_dir/
15
+ # or
16
+ # python zero_to_fp32.py . output_dir/ --safe_serialization
17
+
18
+ import argparse
19
+ import torch
20
+ import glob
21
+ import math
22
+ import os
23
+ import re
24
+ import gc
25
+ import json
26
+ import numpy as np
27
+ from tqdm import tqdm
28
+ from collections import OrderedDict
29
+ from dataclasses import dataclass
30
+
31
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
32
+ # DeepSpeed data structures it has to be available in the current python environment.
33
+ from deepspeed.utils import logger
34
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
35
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
36
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
37
+
38
+
39
@dataclass
class zero_model_state:
    """Per-file summary of a DeepSpeed ``*_model_states.pt`` checkpoint.

    Instances are built by ``parse_model_states``. The annotations are plain
    types (the previous ``dict()`` annotations were dict *instances*, which
    type checkers and ``typing.get_type_hints`` cannot interpret).
    """
    # buffer name -> tensor, converted with ``.float()`` by parse_model_states
    buffers: dict
    # sequence of ``{param_name: shape}`` mappings (iterated and zipped per group by the merge helpers)
    param_shapes: list
    # list of ``[key, value]`` pairs taken from the checkpoint's "shared_params" mapping
    shared_params: list
    # value stored under DS_VERSION; None when the key is absent from the checkpoint
    ds_version: int
    # ``{param_name: shape}`` for frozen parameters; None when the checkpoint has none recorded
    frozen_param_shapes: dict
    # ``{param_name: tensor}`` fragments for frozen parameters; None when absent
    frozen_param_fragments: dict
47
+
48
+
49
# Set to a truthy value to make the helpers below print verbose diagnostics.
debug = 0

# Checkpoint tensors are mapped onto this device when loaded (CPU).
device = torch.device("cpu")
53
+
54
+
55
def atoi(text):
    """Return ``int(text)`` when ``text`` consists only of decimal digits, else ``text`` unchanged.

    ``str.isdecimal`` is used instead of ``str.isdigit`` because ``isdigit`` is also
    true for characters such as superscript digits ("²"), which ``int()`` rejects
    with ``ValueError``.
    """
    return int(text) if text.isdecimal() else text
57
+
58
+
59
def natural_keys(text):
    '''
    Sort key for "human" ordering: ``alist.sort(key=natural_keys)``.

    The text is split into alternating non-digit / digit chunks and every chunk is
    passed through ``atoi`` so that numeric chunks compare as integers.
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    chunks = re.split(r'(\d+)', text)
    return list(map(atoi, chunks))
66
+
67
+
68
def get_model_state_file(checkpoint_dir, zero_stage):
    """Return the path of the single model-states file expected for ``zero_stage``.

    Args:
        - ``checkpoint_dir``: directory that holds the checkpoint files
        - ``zero_stage``: ZeRO stage number; stages <= 2 and stage 3 use different file names

    Raises:
        - ``FileNotFoundError``: ``checkpoint_dir`` is not a directory, or the expected file is missing
        - ``ValueError``: ``zero_stage`` is neither <= 2 nor 3
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # previously fell through and failed later with UnboundLocalError on ``file``
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
82
+
83
+
84
def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """Return the files in ``checkpoint_dir`` matching ``glob_pattern``, in natural (human) order.

    Raises ``FileNotFoundError`` when nothing matches.
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    matches = glob.glob(os.path.join(checkpoint_dir, glob_pattern))
    matches.sort(key=natural_keys)

    if not matches:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")

    return matches
92
+
93
+
94
def get_optim_files(checkpoint_dir):
    """Return the naturally sorted ``*_optim_states.pt`` files found in ``checkpoint_dir``."""
    optim_pattern = "*_optim_states.pt"
    return get_checkpoint_files(checkpoint_dir, optim_pattern)
96
+
97
+
98
def get_model_state_files(checkpoint_dir):
    """Return the naturally sorted ``*_model_states.pt`` files found in ``checkpoint_dir``."""
    model_pattern = "*_model_states.pt"
    return get_checkpoint_files(checkpoint_dir, model_pattern)
100
+
101
+
102
def parse_model_states(files):
    """Load every model-states file and summarise it as a ``zero_model_state``.

    Args:
        - ``files``: paths of ``*_model_states.pt`` files

    Returns:
        list of ``zero_model_state``, one per input file, in input order.

    Raises:
        - ``ValueError``: a file does not contain the ``BUFFER_NAMES`` entry
    """
    zero_model_states = []
    for file in files:
        # NOTE(security): weights_only=False unpickles arbitrary objects; only load
        # checkpoints that come from a trusted source.
        state_dict = torch.load(file, map_location=device, weights_only=False)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        # frozen parameters are optional; both entries stay None when absent
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if frozen_param_shapes is not None and debug:
            print(f"Found frozen_param_shapes: {frozen_param_shapes}")
        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        ds_version = state_dict.get(DS_VERSION, None)

        z_model_state = zero_model_state(buffers=buffers,
                                         param_shapes=param_shapes,
                                         shared_params=shared_params,
                                         ds_version=ds_version,
                                         frozen_param_shapes=frozen_param_shapes,
                                         frozen_param_fragments=frozen_param_fragments)
        zero_model_states.append(z_model_state)

    return zero_model_states
146
+
147
+
148
def parse_optim_states(files, ds_checkpoint_dir):
    """Load the optimizer-state shards and pull out the fp32 flat groups.

    Args:
        - ``files``: paths of the ``*_optim_states.pt`` shards
        - ``ds_checkpoint_dir``: checkpoint directory, used only in the error message

    Returns:
        ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` holds one
        entry per shard, in the order of ``files``.

    Raises:
        - ``ValueError``: first shard has no ZeRO stage entry, the shard count differs from the
          recorded partition count, or the stage is unknown
    """
    total_files = len(files)
    state_dicts = []
    for shard_path in tqdm(files, desc='Loading checkpoint shards'):
        shard = torch.load(shard_path, map_location=device, mmap=True, weights_only=False)
        # immediately discard the potentially huge optimizer states as we only care for fp32 master
        # weights; pop() with a default also covers the case where another helper script already
        # removed them
        shard["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(shard)

    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in first_optim_state:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = first_optim_state[ZERO_STAGE]
    world_size = first_optim_state[PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if type(world_size) is list:
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
    elif zero_stage == 3:
        fp32_groups_key = FP32_FLAT_GROUPS
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    fp32_flat_groups = [shard[OPTIMIZER_STATE_DICT][fp32_groups_key] for shard in state_dicts]
    return zero_stage, world_size, fp32_flat_groups
186
+
187
+
188
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
        - ``exclude_frozen_parameters``: when true, frozen parameters are not merged into the result

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
                                                                  ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    # pick the stage-specific merge routine; both take the same arguments
    if zero_stage <= 2:
        merge_fn = _get_fp32_state_dict_from_zero2_checkpoint
    elif zero_stage == 3:
        merge_fn = _get_fp32_state_dict_from_zero3_checkpoint
    else:
        return None

    return merge_fn(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
213
+
214
+
215
def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """Copy rank 0's frozen parameter fragments into ``state_dict`` (in place).

    Does nothing when the first model state records no frozen parameter shapes.
    """
    rank0_state = zero_model_states[0]
    if rank0_state.frozen_param_shapes is None or len(rank0_state.frozen_param_shapes) == 0:
        return

    frozen_param_shapes = rank0_state.frozen_param_shapes
    frozen_param_fragments = rank0_state.frozen_param_fragments

    if debug:
        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

        wanted_params = len(frozen_param_shapes)
        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
        avail_numel = sum(p.numel() for p in frozen_param_fragments.values())
        print(f'Frozen params: Have {avail_numel} numels to process.')
        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in frozen_param_shapes.items():
        unpartitioned_numel = shape.numel()
        total_params += 1
        total_numel += unpartitioned_numel

        # the fragment is stored as-is; no reshaping or concatenation happens here
        state_dict[name] = frozen_param_fragments[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
245
+
246
+
247
+ def _has_callable(obj, fn):
248
+ attr = getattr(obj, fn, None)
249
+ return callable(attr)
250
+
251
+
252
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Rebuild every trainable parameter from the per-rank flat fp32 groups (ZeRO stage <= 2).

    For each param group the partitions of all ranks are concatenated into one flat vector,
    then each parameter is cut out of that vector in the order given by ``param_shapes`` and
    stored in ``state_dict`` (modified in place).

    Raises ``ValueError`` when, after alignment to ``2 * world_size``, the consumed element
    count does not match the size of the group's flat vector.
    """
    param_shapes = zero_model_states[0].param_shapes

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = [
        torch.cat([rank_groups[group_idx] for rank_groups in fp32_flat_groups], 0)
        for group_idx in range(num_param_groups)
    ]
    avail_numel = sum(flat_vector.numel() for flat_vector in merged_single_partition_of_fp32_groups)

    if debug:
        wanted_params = sum(len(shapes) for shapes in param_shapes)
        wanted_numel = sum(sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes)
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    align_to = 2 * world_size

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, flat_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = flat_vector.numel()
        for name, shape in shapes.items():
            # shapes may be objects exposing numel() or plain sequences of ints
            if _has_callable(shape, 'numel'):
                unpartitioned_numel = shape.numel()
            else:
                unpartitioned_numel = math.prod(shape)
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            state_dict[name] = flat_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        # round both counters up to the next multiple of align_to before comparing
        offset = align_to * math.ceil(offset / align_to)
        avail_numel = align_to * math.ceil(avail_numel / align_to)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
323
+
324
+
325
def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble the full state dict for a ZeRO stage <= 2 checkpoint.

    Entries are added in this order: rank 0's buffers, frozen parameters (unless
    ``exclude_frozen_parameters`` is true), trainable parameters, then shared-parameter
    aliases. Returns the resulting ``OrderedDict``.
    """
    rank0_state = zero_model_states[0]
    state_dict = OrderedDict()

    # buffers
    state_dict.update(rank0_state.buffers)
    if debug:
        print(f"added {len(rank0_state.buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero2_merge_frozen_params(state_dict, zero_model_states)

    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters: alias pair[0] to the already reconstructed pair[1]
    for pair in rank0_state.shared_params:
        alias_name, source_name = pair[0], pair[1]
        if source_name in state_dict:
            state_dict[alias_name] = state_dict[source_name]

    return state_dict
346
+
347
+
348
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """Return ``(partitioned_numel, padding_numel)`` for splitting a parameter across ranks.

    ``padding_numel`` is the number of elements needed to make ``unpartitioned_numel`` a
    multiple of ``world_size``; ``partitioned_numel`` is the per-rank element count, i.e.
    ``ceil(unpartitioned_numel / world_size)``.

    The ceiling is computed with integer arithmetic only: going through float division
    (``math.ceil(a / b)``) loses precision once the counts exceed what a float can
    represent exactly.
    """
    remainder = unpartitioned_numel % world_size
    padding_numel = (world_size - remainder) if remainder else 0
    # padded size is an exact multiple of world_size, so floor division is exact
    partitioned_numel = (unpartitioned_numel + padding_numel) // world_size
    return partitioned_numel, padding_numel
353
+
354
+
355
def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """Rebuild frozen parameters for ZeRO stage 3 and store them in ``state_dict`` (in place).

    Each parameter's fragments from all model states are concatenated, truncated to the
    parameter's element count and viewed with its recorded shape. Does nothing when the
    first model state records no frozen parameter shapes.
    """
    rank0_state = zero_model_states[0]
    if rank0_state.frozen_param_shapes is None or len(rank0_state.frozen_param_shapes) == 0:
        return

    if debug:
        for i in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

        frozen_param_shapes = rank0_state.frozen_param_shapes
        wanted_params = len(frozen_param_shapes)
        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
        avail_numel = sum(p.numel() for p in rank0_state.frozen_param_fragments.values()) * world_size
        print(f'Frozen params: Have {avail_numel} numels to process.')
        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in rank0_state.frozen_param_shapes.items():
        unpartitioned_numel = shape.numel()
        total_params += 1
        total_numel += unpartitioned_numel

        # gather this parameter's fragment from every model state, in order
        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
        # narrow() drops whatever trails the real elements before reshaping
        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
389
+
390
+
391
class GatheredTensor:
    """
    A pseudo tensor that collects partitioned weights.
    It is more memory efficient when there are multiple groups.

    The real tensor is only materialised when ``contiguous()`` is called.
    """

    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
        """
        Args:
            - ``flat_groups``: flat tensors indexed as ``flat_groups[rank][group]``
            - ``flat_groups_offset``: cumulative element offsets of the groups; entry ``g`` is where
              group ``g`` starts and entry ``g + 1`` where it ends
            - ``offset``: start of this parameter's partition in the concatenated groups of one rank
            - ``partitioned_numel``: number of elements each rank holds for this parameter
            - ``shape``: full shape of the parameter (must provide ``numel()``)
        """
        self.flat_groups = flat_groups
        self.flat_groups_offset = flat_groups_offset
        self.offset = offset
        self.partitioned_numel = partitioned_numel
        self.shape = shape
        self.dtype = self.flat_groups[0][0].dtype

    def _locate_groups(self, end_idx):
        """Return ``(start_group_id, end_group_id)`` covering ``[self.offset, end_idx)``."""
        start_group_id = None
        end_group_id = None
        # the last entry of flat_groups_offset is only an end marker, hence len - 1
        for group_id in range(len(self.flat_groups_offset) - 1):
            group_start = self.flat_groups_offset[group_id]
            group_end = self.flat_groups_offset[group_id + 1]
            if group_start <= self.offset < group_end:
                start_group_id = group_id
            if group_start < end_idx <= group_end:
                end_group_id = group_id
                break
        if start_group_id is None or end_group_id is None:
            raise ValueError(f"partition [{self.offset}, {end_idx}) is outside the flat groups "
                             f"(total numel {self.flat_groups_offset[-1]})")
        return start_group_id, end_group_id

    def contiguous(self):
        """
        Merge partitioned weights from flat_groups into a single tensor.

        Returns a tensor of ``self.shape``. Raises ``ValueError`` when the partition range does
        not fall inside the groups described by ``flat_groups_offset``.
        """
        # a zero-element parameter occupies no range in the flat groups, so there is nothing to locate
        if self.shape.numel() == 0:
            return torch.empty(self.shape, dtype=self.dtype)

        end_idx = self.offset + self.partitioned_numel
        # the group range is the same for every rank, so resolve it once
        start_group_id, end_group_id = self._locate_groups(end_idx)

        pad_flat_param_chunks = []
        for flat_groups_at_rank_i in self.flat_groups:
            # collect weights from related group/groups
            for group_id in range(start_group_id, end_group_id + 1):
                group_start = self.flat_groups_offset[group_id]
                group_end = self.flat_groups_offset[group_id + 1]
                flat_tensor = flat_groups_at_rank_i[group_id]
                # clamp to the group start: for groups after the first one the partition
                # begins at index 0 of the group, not at a negative index
                start_offset = max(self.offset, group_start) - group_start
                end_offset = min(end_idx, group_end) - group_start
                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])

        # collect weights from all ranks
        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
        # drop trailing padding, then restore the parameter's shape
        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
        return param
435
+
436
+
437
def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Register every trainable parameter of a ZeRO stage 3 checkpoint in ``state_dict``.

    Parameters are stored as lazy ``GatheredTensor`` objects; no tensor data is merged here.

    Args:
        - ``state_dict``: mapping that receives ``name -> GatheredTensor`` (modified in place)
        - ``world_size``: number of ranks the parameters were partitioned over
        - ``fp32_flat_groups``: flat fp32 tensors indexed as ``fp32_flat_groups[rank][group]``
        - ``zero_model_states``: parsed model states; only the first one's ``param_shapes`` is used

    Raises:
        - ``ValueError``: the consumed element count does not match the available element count
    """
    param_shapes = zero_model_states[0].param_shapes
    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size

    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    # merge list of dicts, preserving order
    param_shapes = {k: v for d in param_shapes for k, v in d.items()}

    if debug:
        # each rank holds a *list* of flat tensors, so shapes are reported per group
        for i in range(world_size):
            for j, flat_group in enumerate(fp32_flat_groups[i]):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={flat_group.shape}")

        wanted_params = len(param_shapes)
        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
        # not asserting if there is a mismatch due to possible padding
        print(f"Trainable params: Have {avail_numel} numels to process.")
        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    # cumulative start offset of every group within one rank, plus the total as last entry
    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1
        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # memory efficient tensor
        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
        state_dict[name] = tensor
        offset += partitioned_numel

    # offset counted per-rank elements; scale it to compare against the all-rank total
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
488
+
489
+
490
def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """
    Assemble the consolidated state dict of a ZeRO-3 checkpoint.

    The result starts from rank 0's buffers, then receives the frozen parameters (unless
    ``exclude_frozen_parameters`` is set) and the trainable parameters, and finally gets an alias
    entry for every shared parameter whose source is present.

    Returns:
        - an ``OrderedDict`` mapping names to buffers / merged parameters
    """
    rank0_states = zero_model_states[0]

    # buffers
    rank0_buffers = rank0_states.buffers
    merged = OrderedDict(rank0_buffers)
    if debug:
        print(f"added {len(rank0_buffers)} buffers")

    # frozen parameters are optional, trainable ones are always merged
    if not exclude_frozen_parameters:
        _zero3_merge_frozen_params(merged, world_size, zero_model_states)
    _zero3_merge_trainable_params(merged, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters: pair[0] is the alias name, pair[1] the name it points to
    for pair in rank0_states.shared_params:
        alias_name, source_name = pair[0], pair[1]
        if source_name in merged:
            merged[alias_name] = merged[source_name]

    return merged
511
+
512
+
513
def to_torch_tensor(state_dict, return_empty_tensor=False):
    """
    Materialize a state_dict of (possibly lazy) tensors into torch tensors.

    Entries that are the very same object in ``state_dict`` (shared tensors) map to one shared
    output tensor. With ``return_empty_tensor=True`` only ``torch.empty`` placeholders with the
    same shape and dtype are created instead of calling ``.contiguous()``.
    """
    first_name_for = {}  # id(input tensor) -> first name it was seen under
    materialized = {}
    for key, value in state_dict.items():
        identity = id(value)
        if identity in first_name_for:
            # shared tensors: reuse the tensor produced for the first occurrence
            materialized[key] = materialized[first_name_for[identity]]
            continue
        first_name_for[identity] = key
        if return_empty_tensor:
            materialized[key] = torch.empty(value.shape, dtype=value.dtype)
        else:
            materialized[key] = value.contiguous()
    return materialized
531
+
532
+
533
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
                                             tag=None,
                                             exclude_frozen_parameters=False,
                                             lazy_mode=False):
    """
    Build a single consolidated fp32 ``state_dict`` from a ZeRO 2 or 3 checkpoint. The result can be
    passed to ``load_state_dict()`` and used for training without DeepSpeed, or shared with others,
    for example via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint, e.g., ``global_step14``.
          If not provided, the tag is read from the ``latest`` file inside ``checkpoint_dir``
        - ``exclude_frozen_parameters``: exclude frozen parameters
        - ``lazy_mode``: return the state_dict in lazy mode, i.e. a dict of pseudo tensors instead of
          torch tensors, which is more memory efficient. Convert a pseudo tensor to a torch tensor
          with ``.contiguous()``

    Returns:
        - pytorch ``state_dict``

    Raises:
        - ``ValueError``: ``tag`` is ``None`` and there is no ``latest`` file in ``checkpoint_dir``
        - ``FileNotFoundError``: the ``checkpoint_dir/tag`` directory doesn't exist

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint. Or you can load the state_dict in lazy mode ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
        for name, lazy_tensor in state_dict.items():
            tensor = lazy_tensor.contiguous()  # to cpu
            print(name, tensor)
            # del tensor to release memory if it is no longer in use
    """
    if tag is None:
        # fall back to the tag recorded in the 'latest' marker file
        latest_file = os.path.join(checkpoint_dir, 'latest')
        if not os.path.isfile(latest_file):
            raise ValueError(f"Unable to find 'latest' file at {latest_file}")
        with open(latest_file, 'r') as handle:
            tag = handle.read().strip()

    tagged_dir = os.path.join(checkpoint_dir, tag)
    if not os.path.isdir(tagged_dir):
        raise FileNotFoundError(f"Directory '{tagged_dir}' doesn't exist")

    lazy_state_dict = _get_fp32_state_dict_from_zero_checkpoint(tagged_dir, exclude_frozen_parameters)
    return lazy_state_dict if lazy_mode else to_torch_tensor(lazy_state_dict)
596
+
597
+
598
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
                                               output_dir,
                                               max_shard_size="5GB",
                                               safe_serialization=False,
                                               tag=None,
                                               exclude_frozen_parameters=False):
    """
    Convert a ZeRO 2 or 3 checkpoint into consolidated fp32 ``state_dict`` file(s) that can be
    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_dir``: directory to the pytorch fp32 state_dict output files
        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB.
          ``None`` writes everything into a single file
        - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters
    """

    # Dependency pre-check: fail early, before any expensive work, if an optional package is missing
    if safe_serialization:
        try:
            from safetensors.torch import save_file
        except ImportError:
            print('If you want to use `safe_serialization`, please `pip install safetensors`')
            raise
    if max_shard_size is not None:
        try:
            from huggingface_hub import split_torch_state_dict_into_shards
        except ImportError:
            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
            raise

    # Convert zero checkpoint to a lazy state_dict
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
                                                          tag,
                                                          exclude_frozen_parameters,
                                                          lazy_mode=True)

    # Shard the model if it is too big.
    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
    if max_shard_size is None:
        from collections import namedtuple
        # minimal stand-in exposing the two attributes read below for the unsharded case
        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
        split_plan = StateDictSplit(is_sharded=False, filename_to_tensors={weights_name: list(state_dict.keys())})
    else:
        # "<stem>{suffix}<ext>", e.g. "model{suffix}.safetensors" / "pytorch_model{suffix}.bin"
        stem, extension = os.path.splitext(weights_name)
        shard_name_pattern = stem + "{suffix}" + extension
        # a memory-efficient approach for sharding: plan the split on empty placeholder tensors
        placeholders = to_torch_tensor(state_dict, return_empty_tensor=True)
        split_plan = split_torch_state_dict_into_shards(placeholders,
                                                        filename_pattern=shard_name_pattern,
                                                        max_shard_size=max_shard_size)

    # Save the model by shard
    os.makedirs(output_dir, exist_ok=True)
    shard_items = split_plan.filename_to_tensors.items()
    for shard_filename, shard_tensor_names in tqdm(shard_items, desc="Saving checkpoint shards"):
        shard_tensors = to_torch_tensor({key: state_dict[key] for key in shard_tensor_names})
        shard_path = os.path.join(output_dir, shard_filename)
        if safe_serialization:
            save_file(shard_tensors, shard_path, metadata={"format": "pt"})
        else:
            torch.save(shard_tensors, shard_path)
        # release the memory of current shard
        for key in list(shard_tensors.keys()):
            del state_dict[key]
            del shard_tensors[key]
        del shard_tensors
        gc.collect()

    # Save index if sharded
    if split_plan.is_sharded:
        index_name = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
        index_path = os.path.join(output_dir, index_name)
        index_payload = {
            "metadata": split_plan.metadata,
            "weight_map": split_plan.tensor_to_filename,
        }
        with open(index_path, "w", encoding="utf-8") as index_file:
            index_file.write(json.dumps(index_payload, indent=2, sort_keys=True) + "\n")
681
+
682
+
683
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    Load the consolidated fp32 weights of a ZeRO 2 or 3 checkpoint into ``model``:

    1. Convert the checkpoint into a single fp32 consolidated ``state_dict``
    2. Put the provided model to cpu
    3. Load the ``state_dict`` into the provided model (with ``strict=False``)

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    logger.info("Extracting fp32 weights")
    fp32_weights = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    cpu_model = model.cpu()
    # strict=False: key mismatches between the model and the checkpoint do not raise
    cpu_model.load_state_dict(fp32_weights, strict=False)
    return cpu_model
720
+
721
+
722
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and convert the checkpoint in ``checkpoint_dir``
    # into fp32 state_dict file(s) under ``output_dir``.
    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint_dir",
                        type=str,
                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    # NOTE: adjacent string literals are concatenated verbatim, so each fragment below ends with an
    # explicit space to keep the rendered help text readable.
    parser.add_argument("output_dir",
                        type=str,
                        help="directory to the pytorch fp32 state_dict output files "
                        "(e.g. path/checkpoint-12-output/)")
    parser.add_argument(
        "--max_shard_size",
        type=str,
        default="5GB",
        help="The maximum size for a checkpoint before being sharded. Each checkpoint shard will then be of a size "
        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
        "without CPU OOM issues.")
    parser.add_argument(
        "--safe_serialization",
        default=False,
        action='store_true',
        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
    parser.add_argument("-t",
                        "--tag",
                        type=str,
                        default=None,
                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    # module-level flag read by the helper functions to enable their debug printing
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
                                               args.output_dir,
                                               max_shard_size=args.max_shard_size,
                                               safe_serialization=args.safe_serialization,
                                               tag=args.tag,
                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/experiment_cfg/conf.yaml ADDED
@@ -0,0 +1,3318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ _target_: groot.vla.model.dreamzero.base_vla.VLA
3
+ _convert_: object
4
+ config:
5
+ _target_: groot.vla.model.dreamzero.base_vla.VLAConfig
6
+ _recursive_: false
7
+ model_dtype: float32
8
+ hidden_size: 0
9
+ action_horizon: 24
10
+ action_dim: 36
11
+ backbone_cfg:
12
+ _target_: groot.vla.model.dreamzero.backbone.identity.IdentityBackbone
13
+ action_head_cfg:
14
+ config:
15
+ backbone_features_projector_cfg: null
16
+ _target_: groot.vla.model.dreamzero.action_head.wan_flow_matching_action_tf.WANPolicyHeadConfig
17
+ _recursive_: false
18
+ tiled: false
19
+ tile_size_height: 34
20
+ tile_size_width: 34
21
+ tile_stride_height: 18
22
+ tile_stride_width: 16
23
+ lora_rank: 4
24
+ lora_alpha: 4
25
+ num_frames: 33
26
+ num_frame_per_block: 2
27
+ lora_target_modules: q,k,v,o,ffn.0,ffn.2
28
+ init_lora_weights: kaiming
29
+ train_architecture: full
30
+ use_gradient_checkpointing: true
31
+ add_pos_embed: true
32
+ model_dtype: float32
33
+ max_state_dim: 64
34
+ max_action_dim: 36
35
+ action_loss_embodiment_ids:
36
+ - 26
37
+ - 17
38
+ - 32
39
+ hidden_size: 64
40
+ input_embedding_dim: 1536
41
+ backbone_embedding_dim: 0
42
+ repa_layer: 8
43
+ repa_coeff: 1.0
44
+ load_pretrained_det_decode_layer_path: null
45
+ freeze_decode_layer: false
46
+ expand_batch: null
47
+ use_vlln: true
48
+ vl_self_attention_cfg:
49
+ _target_: groot.vla.model.n1_5.modules.cross_attention_dit.SelfAttentionTransformer
50
+ positional_embeddings: null
51
+ num_layers: 4
52
+ num_attention_heads: 24
53
+ attention_head_dim: 64
54
+ dropout: 0.2
55
+ final_dropout: true
56
+ diffusion_model_cfg:
57
+ _target_: groot.vla.model.dreamzero.modules.wan_video_dit_action_casual_chunk.CausalWanModel
58
+ _convert_: object
59
+ diffusion_model_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P
60
+ model_type: i2v
61
+ frame_seqlen: 220
62
+ dim: 5120
63
+ in_dim: 36
64
+ ffn_dim: 13824
65
+ out_dim: 16
66
+ freq_dim: 256
67
+ eps: 1.0e-06
68
+ num_heads: 40
69
+ num_layers: 40
70
+ max_chunk_size: 4
71
+ num_frame_per_block: 2
72
+ num_action_per_block: 24
73
+ num_state_per_block: 1
74
+ action_dim: 36
75
+ text_encoder_cfg:
76
+ _target_: groot.vla.model.dreamzero.modules.wan_video_text_encoder.WanTextEncoder
77
+ _convert_: object
78
+ text_encoder_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/models_t5_umt5-xxl-enc-bf16.pth
79
+ image_encoder_cfg:
80
+ _target_: groot.vla.model.dreamzero.modules.wan_video_image_encoder.WanImageEncoder
81
+ _convert_: object
82
+ image_encoder_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth
83
+ vae_cfg:
84
+ _target_: groot.vla.model.dreamzero.modules.wan_video_vae.WanVideoVAE
85
+ _convert_: object
86
+ vae_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/Wan2.1_VAE.pth
87
+ action_dim: 36
88
+ action_horizon: 24
89
+ num_inference_timesteps: 4
90
+ noise_beta_alpha: 1.5
91
+ noise_beta_beta: 1.0
92
+ noise_s: 0.999
93
+ num_timestep_buckets: 1000
94
+ decouple_video_action_noise: false
95
+ video_noise_beta_alpha: 3.0
96
+ video_noise_beta_beta: 1.0
97
+ tune_projector: true
98
+ tune_diffusion_model: true
99
+ skip_component_loading: true
100
+ _target_: groot.vla.model.dreamzero.action_head.wan_flow_matching_action_tf.WANPolicyHead
101
+ _convert_: object
102
+ train_dataset:
103
+ _target_: groot.vla.data.dataset.lerobot_sharded.ShardedLeRobotMixtureDataset.from_mixture_spec
104
+ _convert_: object
105
+ mixture_spec:
106
+ - dataset_path:
107
+ real_teleop_g1:
108
+ - /hfm/boqian/liboqian_data/data/real_data/gear/g1/Pick_bottle_and_turn_and_pour_into_cup
109
+ dataset_weight: 1.0
110
+ distribute_weights: true
111
+ dataset_class: groot.vla.data.dataset.lerobot_sharded.ShardedLeRobotSubLangSingleActionChunkDatasetDROID
112
+ all_modality_configs:
113
+ oxe_droid:
114
+ video:
115
+ _target_: groot.vla.data.dataset.ModalityConfig
116
+ delta_indices:
117
+ - 0
118
+ - 1
119
+ - 2
120
+ - 3
121
+ - 4
122
+ - 5
123
+ - 6
124
+ - 7
125
+ - 8
126
+ - 9
127
+ - 10
128
+ - 11
129
+ - 12
130
+ - 13
131
+ - 14
132
+ - 15
133
+ - 16
134
+ - 17
135
+ - 18
136
+ - 19
137
+ - 20
138
+ - 21
139
+ - 22
140
+ - 23
141
+ - 24
142
+ eval_delta_indices:
143
+ - 0
144
+ modality_keys:
145
+ - video.exterior_image_1_left
146
+ - video.exterior_image_2_left
147
+ - video.wrist_image_left
148
+ state:
149
+ _target_: groot.vla.data.dataset.ModalityConfig
150
+ delta_indices:
151
+ - 0
152
+ modality_keys:
153
+ - state.joint_position
154
+ - state.gripper_position
155
+ action:
156
+ _target_: groot.vla.data.dataset.ModalityConfig
157
+ delta_indices:
158
+ - 0
159
+ - 1
160
+ - 2
161
+ - 3
162
+ - 4
163
+ - 5
164
+ - 6
165
+ - 7
166
+ - 8
167
+ - 9
168
+ - 10
169
+ - 11
170
+ - 12
171
+ - 13
172
+ - 14
173
+ - 15
174
+ - 16
175
+ - 17
176
+ - 18
177
+ - 19
178
+ - 20
179
+ - 21
180
+ - 22
181
+ - 23
182
+ modality_keys:
183
+ - action.joint_position
184
+ - action.gripper_position
185
+ language:
186
+ _target_: groot.vla.data.dataset.ModalityConfig
187
+ delta_indices:
188
+ - 0
189
+ modality_keys:
190
+ - annotation.language.language_instruction
191
+ - annotation.language.language_instruction_2
192
+ - annotation.language.language_instruction_3
193
+ lapa_action:
194
+ _target_: groot.vla.data.dataset.ModalityConfig
195
+ delta_indices:
196
+ - 0
197
+ modality_keys:
198
+ - lapa_action
199
+ agibot:
200
+ video:
201
+ _target_: groot.vla.data.dataset.ModalityConfig
202
+ delta_indices:
203
+ - 0
204
+ - 1
205
+ - 2
206
+ - 3
207
+ - 4
208
+ - 5
209
+ - 6
210
+ - 7
211
+ - 8
212
+ - 9
213
+ - 10
214
+ - 11
215
+ - 12
216
+ - 13
217
+ - 14
218
+ - 15
219
+ - 16
220
+ - 17
221
+ - 18
222
+ - 19
223
+ - 20
224
+ - 21
225
+ - 22
226
+ - 23
227
+ - 24
228
+ eval_delta_indices:
229
+ - -3
230
+ - -2
231
+ - -1
232
+ - 0
233
+ modality_keys:
234
+ - video.top_head
235
+ - video.hand_left
236
+ - video.hand_right
237
+ state:
238
+ _target_: groot.vla.data.dataset.ModalityConfig
239
+ delta_indices:
240
+ - 0
241
+ modality_keys:
242
+ - state.left_arm_joint_position
243
+ - state.right_arm_joint_position
244
+ - state.left_effector_position
245
+ - state.right_effector_position
246
+ - state.head_position
247
+ - state.waist_position
248
+ action:
249
+ _target_: groot.vla.data.dataset.ModalityConfig
250
+ delta_indices:
251
+ - 0
252
+ - 1
253
+ - 2
254
+ - 3
255
+ - 4
256
+ - 5
257
+ - 6
258
+ - 7
259
+ - 8
260
+ - 9
261
+ - 10
262
+ - 11
263
+ - 12
264
+ - 13
265
+ - 14
266
+ - 15
267
+ - 16
268
+ - 17
269
+ - 18
270
+ - 19
271
+ - 20
272
+ - 21
273
+ - 22
274
+ - 23
275
+ modality_keys:
276
+ - action.left_arm_joint_position
277
+ - action.right_arm_joint_position
278
+ - action.left_effector_position
279
+ - action.right_effector_position
280
+ - action.head_position
281
+ - action.waist_position
282
+ - action.robot_velocity
283
+ language:
284
+ _target_: groot.vla.data.dataset.ModalityConfig
285
+ delta_indices:
286
+ - 0
287
+ modality_keys:
288
+ - annotation.language.action_text
289
+ yam:
290
+ video:
291
+ _target_: groot.vla.data.dataset.ModalityConfig
292
+ delta_indices:
293
+ - 0
294
+ - 1
295
+ - 2
296
+ - 3
297
+ - 4
298
+ - 5
299
+ - 6
300
+ - 7
301
+ - 8
302
+ - 9
303
+ - 10
304
+ - 11
305
+ - 12
306
+ - 13
307
+ - 14
308
+ - 15
309
+ - 16
310
+ - 17
311
+ - 18
312
+ - 19
313
+ - 20
314
+ - 21
315
+ - 22
316
+ - 23
317
+ - 24
318
+ eval_delta_indices:
319
+ - 0
320
+ modality_keys:
321
+ - video.top_camera-images-rgb
322
+ - video.left_camera-images-rgb
323
+ - video.right_camera-images-rgb
324
+ state:
325
+ _target_: groot.vla.data.dataset.ModalityConfig
326
+ delta_indices:
327
+ - 0
328
+ modality_keys:
329
+ - state.left_joint_pos
330
+ - state.left_gripper_pos
331
+ - state.right_joint_pos
332
+ - state.right_gripper_pos
333
+ action:
334
+ _target_: groot.vla.data.dataset.ModalityConfig
335
+ delta_indices:
336
+ - 0
337
+ - 1
338
+ - 2
339
+ - 3
340
+ - 4
341
+ - 5
342
+ - 6
343
+ - 7
344
+ - 8
345
+ - 9
346
+ - 10
347
+ - 11
348
+ - 12
349
+ - 13
350
+ - 14
351
+ - 15
352
+ - 16
353
+ - 17
354
+ - 18
355
+ - 19
356
+ - 20
357
+ - 21
358
+ - 22
359
+ - 23
360
+ modality_keys:
361
+ - action.left_joint_pos
362
+ - action.left_gripper_pos
363
+ - action.right_joint_pos
364
+ - action.right_gripper_pos
365
+ language:
366
+ _target_: groot.vla.data.dataset.ModalityConfig
367
+ delta_indices:
368
+ - 0
369
+ modality_keys:
370
+ - annotation.task
371
+ real_teleop_g1:
372
+ video:
373
+ _target_: groot.vla.data.dataset.ModalityConfig
374
+ delta_indices:
375
+ - 0
376
+ - 1
377
+ - 2
378
+ - 3
379
+ - 4
380
+ - 5
381
+ - 6
382
+ - 7
383
+ - 8
384
+ - 9
385
+ - 10
386
+ - 11
387
+ - 12
388
+ - 13
389
+ - 14
390
+ - 15
391
+ - 16
392
+ - 17
393
+ - 18
394
+ - 19
395
+ - 20
396
+ - 21
397
+ - 22
398
+ - 23
399
+ - 24
400
+ eval_delta_indices:
401
+ - 0
402
+ modality_keys:
403
+ - video.egocentric
404
+ state:
405
+ _target_: groot.vla.data.dataset.ModalityConfig
406
+ delta_indices:
407
+ - 0
408
+ modality_keys:
409
+ - state.left_hand
410
+ - state.right_hand
411
+ - state.left_arm
412
+ - state.right_arm
413
+ - state.rpy
414
+ - state.height
415
+ action:
416
+ _target_: groot.vla.data.dataset.ModalityConfig
417
+ delta_indices:
418
+ - 0
419
+ - 1
420
+ - 2
421
+ - 3
422
+ - 4
423
+ - 5
424
+ - 6
425
+ - 7
426
+ - 8
427
+ - 9
428
+ - 10
429
+ - 11
430
+ - 12
431
+ - 13
432
+ - 14
433
+ - 15
434
+ - 16
435
+ - 17
436
+ - 18
437
+ - 19
438
+ - 20
439
+ - 21
440
+ - 22
441
+ - 23
442
+ modality_keys:
443
+ - action.left_hand
444
+ - action.right_hand
445
+ - action.left_arm
446
+ - action.right_arm
447
+ - action.rpy
448
+ - action.height
449
+ - action.torso_vx
450
+ - action.torso_vy
451
+ - action.torso_vyaw
452
+ - action.torso_dyaw
453
+ language:
454
+ _target_: groot.vla.data.dataset.ModalityConfig
455
+ delta_indices:
456
+ - 0
457
+ modality_keys:
458
+ - annotation.language.language_instruction
459
+ all_transforms:
460
+ oxe_droid:
461
+ _target_: groot.vla.data.transform.ComposedModalityTransform
462
+ transforms:
463
+ - _target_: groot.vla.data.transform.VideoToTensor
464
+ apply_to:
465
+ - video.exterior_image_1_left
466
+ - video.exterior_image_2_left
467
+ - video.wrist_image_left
468
+ - _target_: groot.vla.data.transform.VideoCrop
469
+ apply_to:
470
+ - video.exterior_image_1_left
471
+ - video.exterior_image_2_left
472
+ - video.wrist_image_left
473
+ scale: 0.95
474
+ mode: random
475
+ - _target_: groot.vla.data.transform.VideoResize
476
+ apply_to:
477
+ - video.exterior_image_1_left
478
+ - video.exterior_image_2_left
479
+ - video.wrist_image_left
480
+ height: 176
481
+ width: 320
482
+ interpolation: linear
483
+ - _target_: groot.vla.data.transform.VideoColorJitter
484
+ apply_to:
485
+ - video.exterior_image_1_left
486
+ - video.exterior_image_2_left
487
+ - video.wrist_image_left
488
+ brightness: 0.3
489
+ contrast: 0.4
490
+ saturation: 0.5
491
+ hue: 0.08
492
+ - _target_: groot.vla.data.transform.VideoToNumpy
493
+ apply_to:
494
+ - video.exterior_image_1_left
495
+ - video.exterior_image_2_left
496
+ - video.wrist_image_left
497
+ - _target_: groot.vla.data.transform.StateActionToTensor
498
+ apply_to:
499
+ - state.joint_position
500
+ - state.gripper_position
501
+ - _target_: groot.vla.data.transform.StateActionTransform
502
+ apply_to:
503
+ - state.joint_position
504
+ - state.gripper_position
505
+ normalization_modes:
506
+ state.joint_position: q99
507
+ state.gripper_position: q99
508
+ - _target_: groot.vla.data.transform.StateActionToTensor
509
+ apply_to:
510
+ - action.joint_position
511
+ - action.gripper_position
512
+ - _target_: groot.vla.data.transform.StateActionTransform
513
+ apply_to:
514
+ - action.joint_position
515
+ - action.gripper_position
516
+ normalization_modes:
517
+ action.joint_position: q99
518
+ action.gripper_position: q99
519
+ - _target_: groot.vla.data.transform.ConcatTransform
520
+ video_concat_order:
521
+ - video.exterior_image_1_left
522
+ - video.exterior_image_2_left
523
+ - video.wrist_image_left
524
+ state_concat_order:
525
+ - state.joint_position
526
+ - state.gripper_position
527
+ action_concat_order:
528
+ - action.joint_position
529
+ - action.gripper_position
530
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
531
+ default_instruction: Perform the default behavior.
532
+ language_dropout_prob: 0.0
533
+ always_use_default_instruction: false
534
+ max_state_dim: 64
535
+ max_action_dim: 36
536
+ max_length: 512
537
+ state_horizon: 1
538
+ action_horizon: 24
539
+ embodiment_tag_mapping:
540
+ real_gr1_arms_only: 0
541
+ real_gr1_arms_only_annotated: 1
542
+ real_gr1_arms_waist: 2
543
+ real_gr1_arms_waist_annotated: 3
544
+ dexmg_gr1_arms_only_inspire: 4
545
+ dexmg_gr1_arms_only_fourier: 5
546
+ dexmg_gr1_arms_waist_fourier: 6
547
+ robocasa_single_arm: 7
548
+ onex_eve_gripper: 8
549
+ robocasa_gr1_arms_only_inspire_hands: 9
550
+ robocasa_gr1_arms_only_fourier_hands: 10
551
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
552
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
553
+ robocasa_panda_omron: 13
554
+ robocasa_bimanual_panda_parallel_gripper: 15
555
+ robocasa_bimanual_panda_inspire_hand: 16
556
+ oxe_droid: 17
557
+ oxe_fractal: 18
558
+ oxe_language_table: 19
559
+ oxe_bridge: 20
560
+ real_panda_single_arm: 21
561
+ hot3d_hands_only: 23
562
+ gr1_unified: 24
563
+ robocasa_gr1_arms_waist_fourier_hands: 25
564
+ agibot: 26
565
+ lapa: 27
566
+ oxe_mutex: 28
567
+ oxe_roboset: 29
568
+ oxe_plex: 30
569
+ dream: 31
570
+ yam: 32
571
+ xdof: 22
572
+ gr1_unified_segmentation: 14
573
+ language_table_sim: 7
574
+ gr1_isaac: 0
575
+ sim_behavior_r1_pro: 31
576
+ mecka_hands: 27
577
+ real_r1_pro_sharpa: 28
578
+ real_teleop_g1: 33
579
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
580
+ agibot:
581
+ _target_: groot.vla.data.transform.ComposedModalityTransform
582
+ transforms:
583
+ - _target_: groot.vla.data.transform.VideoToTensor
584
+ apply_to:
585
+ - video.top_head
586
+ - video.hand_left
587
+ - video.hand_right
588
+ - _target_: groot.vla.data.transform.VideoCrop
589
+ apply_to:
590
+ - video.top_head
591
+ - video.hand_left
592
+ - video.hand_right
593
+ scale: 0.95
594
+ mode: random
595
+ - _target_: groot.vla.data.transform.VideoResize
596
+ apply_to:
597
+ - video.top_head
598
+ - video.hand_left
599
+ - video.hand_right
600
+ height: 176
601
+ width: 320
602
+ interpolation: linear
603
+ - _target_: groot.vla.data.transform.VideoColorJitter
604
+ apply_to:
605
+ - video.top_head
606
+ - video.hand_left
607
+ - video.hand_right
608
+ brightness: 0.3
609
+ contrast: 0.4
610
+ saturation: 0.5
611
+ hue: 0.08
612
+ - _target_: groot.vla.data.transform.VideoToNumpy
613
+ apply_to:
614
+ - video.top_head
615
+ - video.hand_left
616
+ - video.hand_right
617
+ - _target_: groot.vla.data.transform.StateActionToTensor
618
+ apply_to:
619
+ - state.left_arm_joint_position
620
+ - state.right_arm_joint_position
621
+ - state.left_effector_position
622
+ - state.right_effector_position
623
+ - state.head_position
624
+ - state.waist_position
625
+ - _target_: groot.vla.data.transform.StateActionTransform
626
+ apply_to:
627
+ - state.left_arm_joint_position
628
+ - state.right_arm_joint_position
629
+ - state.left_effector_position
630
+ - state.right_effector_position
631
+ - state.head_position
632
+ - state.waist_position
633
+ normalization_modes:
634
+ state.left_arm_joint_position: q99
635
+ state.right_arm_joint_position: q99
636
+ state.left_effector_position: q99
637
+ state.right_effector_position: q99
638
+ state.head_position: q99
639
+ state.waist_position: q99
640
+ - _target_: groot.vla.data.transform.StateActionToTensor
641
+ apply_to:
642
+ - action.left_arm_joint_position
643
+ - action.right_arm_joint_position
644
+ - action.left_effector_position
645
+ - action.right_effector_position
646
+ - action.head_position
647
+ - action.waist_position
648
+ - action.robot_velocity
649
+ - _target_: groot.vla.data.transform.StateActionTransform
650
+ apply_to:
651
+ - action.left_arm_joint_position
652
+ - action.right_arm_joint_position
653
+ - action.left_effector_position
654
+ - action.right_effector_position
655
+ - action.head_position
656
+ - action.waist_position
657
+ - action.robot_velocity
658
+ normalization_modes:
659
+ action.left_arm_joint_position: q99
660
+ action.right_arm_joint_position: q99
661
+ action.left_effector_position: q99
662
+ action.right_effector_position: q99
663
+ action.head_position: q99
664
+ action.waist_position: q99
665
+ action.robot_velocity: q99
666
+ - _target_: groot.vla.data.transform.ConcatTransform
667
+ video_concat_order:
668
+ - video.top_head
669
+ - video.hand_left
670
+ - video.hand_right
671
+ state_concat_order:
672
+ - state.left_arm_joint_position
673
+ - state.right_arm_joint_position
674
+ - state.left_effector_position
675
+ - state.right_effector_position
676
+ - state.head_position
677
+ - state.waist_position
678
+ action_concat_order:
679
+ - action.left_arm_joint_position
680
+ - action.right_arm_joint_position
681
+ - action.left_effector_position
682
+ - action.right_effector_position
683
+ - action.head_position
684
+ - action.waist_position
685
+ - action.robot_velocity
686
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
687
+ default_instruction: Perform the default behavior.
688
+ language_dropout_prob: 0.0
689
+ always_use_default_instruction: false
690
+ max_state_dim: 64
691
+ max_action_dim: 36
692
+ max_length: 512
693
+ state_horizon: 1
694
+ action_horizon: 24
695
+ embodiment_tag_mapping:
696
+ real_gr1_arms_only: 0
697
+ real_gr1_arms_only_annotated: 1
698
+ real_gr1_arms_waist: 2
699
+ real_gr1_arms_waist_annotated: 3
700
+ dexmg_gr1_arms_only_inspire: 4
701
+ dexmg_gr1_arms_only_fourier: 5
702
+ dexmg_gr1_arms_waist_fourier: 6
703
+ robocasa_single_arm: 7
704
+ onex_eve_gripper: 8
705
+ robocasa_gr1_arms_only_inspire_hands: 9
706
+ robocasa_gr1_arms_only_fourier_hands: 10
707
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
708
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
709
+ robocasa_panda_omron: 13
710
+ robocasa_bimanual_panda_parallel_gripper: 15
711
+ robocasa_bimanual_panda_inspire_hand: 16
712
+ oxe_droid: 17
713
+ oxe_fractal: 18
714
+ oxe_language_table: 19
715
+ oxe_bridge: 20
716
+ real_panda_single_arm: 21
717
+ hot3d_hands_only: 23
718
+ gr1_unified: 24
719
+ robocasa_gr1_arms_waist_fourier_hands: 25
720
+ agibot: 26
721
+ lapa: 27
722
+ oxe_mutex: 28
723
+ oxe_roboset: 29
724
+ oxe_plex: 30
725
+ dream: 31
726
+ yam: 32
727
+ xdof: 22
728
+ gr1_unified_segmentation: 14
729
+ language_table_sim: 7
730
+ gr1_isaac: 0
731
+ sim_behavior_r1_pro: 31
732
+ mecka_hands: 27
733
+ real_r1_pro_sharpa: 28
734
+ real_teleop_g1: 33
735
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
736
+ yam:
737
+ _target_: groot.vla.data.transform.ComposedModalityTransform
738
+ transforms:
739
+ - _target_: groot.vla.data.transform.VideoToTensor
740
+ apply_to:
741
+ - video.top_camera-images-rgb
742
+ - video.left_camera-images-rgb
743
+ - video.right_camera-images-rgb
744
+ - _target_: groot.vla.data.transform.VideoCrop
745
+ apply_to:
746
+ - video.top_camera-images-rgb
747
+ - video.left_camera-images-rgb
748
+ - video.right_camera-images-rgb
749
+ scale: 0.95
750
+ mode: random
751
+ - _target_: groot.vla.data.transform.VideoResize
752
+ apply_to:
753
+ - video.top_camera-images-rgb
754
+ - video.left_camera-images-rgb
755
+ - video.right_camera-images-rgb
756
+ height: 176
757
+ width: 320
758
+ interpolation: linear
759
+ - _target_: groot.vla.data.transform.VideoColorJitter
760
+ apply_to:
761
+ - video.top_camera-images-rgb
762
+ - video.left_camera-images-rgb
763
+ - video.right_camera-images-rgb
764
+ brightness: 0.3
765
+ contrast: 0.4
766
+ saturation: 0.5
767
+ hue: 0.08
768
+ - _target_: groot.vla.data.transform.VideoToNumpy
769
+ apply_to:
770
+ - video.top_camera-images-rgb
771
+ - video.left_camera-images-rgb
772
+ - video.right_camera-images-rgb
773
+ - _target_: groot.vla.data.transform.StateActionToTensor
774
+ apply_to:
775
+ - state.left_joint_pos
776
+ - state.left_gripper_pos
777
+ - state.right_joint_pos
778
+ - state.right_gripper_pos
779
+ - _target_: groot.vla.data.transform.StateActionTransform
780
+ apply_to:
781
+ - state.left_joint_pos
782
+ - state.left_gripper_pos
783
+ - state.right_joint_pos
784
+ - state.right_gripper_pos
785
+ normalization_modes:
786
+ state.left_joint_pos: q99
787
+ state.left_gripper_pos: q99
788
+ state.right_joint_pos: q99
789
+ state.right_gripper_pos: q99
790
+ - _target_: groot.vla.data.transform.StateActionToTensor
791
+ apply_to:
792
+ - action.left_joint_pos
793
+ - action.left_gripper_pos
794
+ - action.right_joint_pos
795
+ - action.right_gripper_pos
796
+ - _target_: groot.vla.data.transform.StateActionTransform
797
+ apply_to:
798
+ - action.left_joint_pos
799
+ - action.left_gripper_pos
800
+ - action.right_joint_pos
801
+ - action.right_gripper_pos
802
+ normalization_modes:
803
+ action.left_joint_pos: q99
804
+ action.left_gripper_pos: q99
805
+ action.right_joint_pos: q99
806
+ action.right_gripper_pos: q99
807
+ - _target_: groot.vla.data.transform.ConcatTransform
808
+ video_concat_order:
809
+ - video.top_camera-images-rgb
810
+ - video.left_camera-images-rgb
811
+ - video.right_camera-images-rgb
812
+ state_concat_order:
813
+ - state.left_joint_pos
814
+ - state.left_gripper_pos
815
+ - state.right_joint_pos
816
+ - state.right_gripper_pos
817
+ action_concat_order:
818
+ - action.left_joint_pos
819
+ - action.left_gripper_pos
820
+ - action.right_joint_pos
821
+ - action.right_gripper_pos
822
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
823
+ default_instruction: Perform the default behavior.
824
+ language_dropout_prob: 0.0
825
+ always_use_default_instruction: false
826
+ max_state_dim: 64
827
+ max_action_dim: 36
828
+ max_length: 512
829
+ state_horizon: 1
830
+ action_horizon: 24
831
+ embodiment_tag_mapping:
832
+ real_gr1_arms_only: 0
833
+ real_gr1_arms_only_annotated: 1
834
+ real_gr1_arms_waist: 2
835
+ real_gr1_arms_waist_annotated: 3
836
+ dexmg_gr1_arms_only_inspire: 4
837
+ dexmg_gr1_arms_only_fourier: 5
838
+ dexmg_gr1_arms_waist_fourier: 6
839
+ robocasa_single_arm: 7
840
+ onex_eve_gripper: 8
841
+ robocasa_gr1_arms_only_inspire_hands: 9
842
+ robocasa_gr1_arms_only_fourier_hands: 10
843
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
844
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
845
+ robocasa_panda_omron: 13
846
+ robocasa_bimanual_panda_parallel_gripper: 15
847
+ robocasa_bimanual_panda_inspire_hand: 16
848
+ oxe_droid: 17
849
+ oxe_fractal: 18
850
+ oxe_language_table: 19
851
+ oxe_bridge: 20
852
+ real_panda_single_arm: 21
853
+ hot3d_hands_only: 23
854
+ gr1_unified: 24
855
+ robocasa_gr1_arms_waist_fourier_hands: 25
856
+ agibot: 26
857
+ lapa: 27
858
+ oxe_mutex: 28
859
+ oxe_roboset: 29
860
+ oxe_plex: 30
861
+ dream: 31
862
+ yam: 32
863
+ xdof: 22
864
+ gr1_unified_segmentation: 14
865
+ language_table_sim: 7
866
+ gr1_isaac: 0
867
+ sim_behavior_r1_pro: 31
868
+ mecka_hands: 27
869
+ real_r1_pro_sharpa: 28
870
+ real_teleop_g1: 33
871
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
872
+ real_teleop_g1:
873
+ _target_: groot.vla.data.transform.ComposedModalityTransform
874
+ transforms:
875
+ - _target_: groot.vla.data.transform.VideoToTensor
876
+ apply_to:
877
+ - video.egocentric
878
+ - _target_: groot.vla.data.transform.VideoCrop
879
+ apply_to:
880
+ - video.egocentric
881
+ scale: 0.95
882
+ mode: random
883
+ - _target_: groot.vla.data.transform.VideoResize
884
+ apply_to:
885
+ - video.egocentric
886
+ height: 176
887
+ width: 320
888
+ interpolation: linear
889
+ - _target_: groot.vla.data.transform.VideoColorJitter
890
+ apply_to:
891
+ - video.egocentric
892
+ brightness: 0.3
893
+ contrast: 0.4
894
+ saturation: 0.5
895
+ hue: 0.08
896
+ - _target_: groot.vla.data.transform.VideoToNumpy
897
+ apply_to:
898
+ - video.egocentric
899
+ - _target_: groot.vla.data.transform.StateActionToTensor
900
+ apply_to:
901
+ - state.left_hand
902
+ - state.right_hand
903
+ - state.left_arm
904
+ - state.right_arm
905
+ - state.rpy
906
+ - state.height
907
+ - _target_: groot.vla.data.transform.StateActionTransform
908
+ apply_to:
909
+ - state.left_hand
910
+ - state.right_hand
911
+ - state.left_arm
912
+ - state.right_arm
913
+ - state.rpy
914
+ - state.height
915
+ normalization_modes:
916
+ state.left_hand: q99
917
+ state.right_hand: q99
918
+ state.left_arm: q99
919
+ state.right_arm: q99
920
+ state.rpy: q99
921
+ state.height: q99
922
+ - _target_: groot.vla.data.transform.StateActionToTensor
923
+ apply_to:
924
+ - action.left_hand
925
+ - action.right_hand
926
+ - action.left_arm
927
+ - action.right_arm
928
+ - action.rpy
929
+ - action.height
930
+ - action.torso_vx
931
+ - action.torso_vy
932
+ - action.torso_vyaw
933
+ - action.torso_dyaw
934
+ - _target_: groot.vla.data.transform.StateActionTransform
935
+ apply_to:
936
+ - action.left_hand
937
+ - action.right_hand
938
+ - action.left_arm
939
+ - action.right_arm
940
+ - action.rpy
941
+ - action.height
942
+ - action.torso_vx
943
+ - action.torso_vy
944
+ - action.torso_vyaw
945
+ - action.torso_dyaw
946
+ normalization_modes:
947
+ action.left_hand: q99
948
+ action.right_hand: q99
949
+ action.left_arm: q99
950
+ action.right_arm: q99
951
+ action.rpy: q99
952
+ action.height: q99
953
+ action.torso_vx: q99
954
+ action.torso_vy: q99
955
+ action.torso_vyaw: q99
956
+ action.torso_dyaw: q99
957
+ - _target_: groot.vla.data.transform.ConcatTransform
958
+ video_concat_order:
959
+ - video.egocentric
960
+ state_concat_order:
961
+ - state.left_hand
962
+ - state.right_hand
963
+ - state.left_arm
964
+ - state.right_arm
965
+ - state.rpy
966
+ - state.height
967
+ action_concat_order:
968
+ - action.left_hand
969
+ - action.right_hand
970
+ - action.left_arm
971
+ - action.right_arm
972
+ - action.rpy
973
+ - action.height
974
+ - action.torso_vx
975
+ - action.torso_vy
976
+ - action.torso_vyaw
977
+ - action.torso_dyaw
978
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
979
+ default_instruction: Perform the default behavior.
980
+ language_dropout_prob: 0.0
981
+ always_use_default_instruction: false
982
+ max_state_dim: 64
983
+ max_action_dim: 36
984
+ max_length: 512
985
+ state_horizon: 1
986
+ action_horizon: 24
987
+ embodiment_tag_mapping:
988
+ real_gr1_arms_only: 0
989
+ real_gr1_arms_only_annotated: 1
990
+ real_gr1_arms_waist: 2
991
+ real_gr1_arms_waist_annotated: 3
992
+ dexmg_gr1_arms_only_inspire: 4
993
+ dexmg_gr1_arms_only_fourier: 5
994
+ dexmg_gr1_arms_waist_fourier: 6
995
+ robocasa_single_arm: 7
996
+ onex_eve_gripper: 8
997
+ robocasa_gr1_arms_only_inspire_hands: 9
998
+ robocasa_gr1_arms_only_fourier_hands: 10
999
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
1000
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
1001
+ robocasa_panda_omron: 13
1002
+ robocasa_bimanual_panda_parallel_gripper: 15
1003
+ robocasa_bimanual_panda_inspire_hand: 16
1004
+ oxe_droid: 17
1005
+ oxe_fractal: 18
1006
+ oxe_language_table: 19
1007
+ oxe_bridge: 20
1008
+ real_panda_single_arm: 21
1009
+ hot3d_hands_only: 23
1010
+ gr1_unified: 24
1011
+ robocasa_gr1_arms_waist_fourier_hands: 25
1012
+ agibot: 26
1013
+ lapa: 27
1014
+ oxe_mutex: 28
1015
+ oxe_roboset: 29
1016
+ oxe_plex: 30
1017
+ dream: 31
1018
+ yam: 32
1019
+ xdof: 22
1020
+ gr1_unified_segmentation: 14
1021
+ language_table_sim: 7
1022
+ gr1_isaac: 0
1023
+ sim_behavior_r1_pro: 31
1024
+ mecka_hands: 27
1025
+ real_r1_pro_sharpa: 28
1026
+ real_teleop_g1: 33
1027
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
1028
+ metadata_versions:
1029
+ oxe_droid: '0221'
1030
+ agibot: '0221'
1031
+ yam: '0221'
1032
+ real_teleop_g1: '0221'
1033
+ fps:
1034
+ yam: 30
1035
+ real_teleop_g1: 30
1036
+ dataset_kwargs:
1037
+ video_backend: decord
1038
+ use_global_metadata: false
1039
+ max_chunk_size: 4
1040
+ relative_action: true
1041
+ relative_action_keys:
1042
+ - left_hand
1043
+ - right_hand
1044
+ - left_arm
1045
+ - right_arm
1046
+ - rpy
1047
+ - height
1048
+ relative_action_per_horizon: false
1049
+ mixture_kwargs:
1050
+ training: true
1051
+ balance_dataset_weights: false
1052
+ seed: 42
1053
+ shard_sampling_rate: 0.1
1054
+ trainer:
1055
+ _target_: groot.vla.experiment.VLATrainer
1056
+ _partial_: true
1057
+ _recursive_: false
1058
+ callbacks: null
1059
+ model: ???
1060
+ train_dataset: ???
1061
+ compute_dtype: ???
1062
+ benchmark_time: false
1063
+ enable_profiling: false
1064
+ profiling_steps: 5
1065
+ enable_prof_callback: false
1066
+ profile_start_step: 50
1067
+ profile_warmup_steps: 1
1068
+ profile_active_steps: 3
1069
+ profile_record_shapes: false
1070
+ profile_with_stack: false
1071
+ profile_memory: false
1072
+ wandb_project: dreamzero
1073
+ output_dir: ./checkpoints/dreamzero_real_teleop_g1_full_finetune
1074
+ load_from_yaml: null
1075
+ gear_credentials: null
1076
+ upload_checkpoints: false
1077
+ upload_every: 1000
1078
+ upload_last_n_checkpoints: 5
1079
+ remove_unused_columns: false
1080
+ bf16: true
1081
+ tf32: true
1082
+ global_batch_size: null
1083
+ raise_error_if_global_batch_size_not_set: false
1084
+ per_device_train_batch_size: 1
1085
+ per_device_eval_batch_size: 64
1086
+ gradient_accumulation_steps: 1
1087
+ dataloader_num_workers: 1
1088
+ dataloader_pin_memory: false
1089
+ dataloader_persistent_workers: true
1090
+ optim: adamw_torch
1091
+ learning_rate: 0.0001
1092
+ adam_beta1: 0.95
1093
+ adam_beta2: 0.999
1094
+ adam_epsilon: 1.0e-08
1095
+ weight_decay: 1.0e-05
1096
+ lr_scheduler_type: cosine
1097
+ warmup_ratio: 0.05
1098
+ logging_steps: 10.0
1099
+ num_train_epochs: 1000
1100
+ max_steps: 20000
1101
+ save_strategy: steps
1102
+ save_steps: 8000
1103
+ eval_strategy: 'no'
1104
+ save_total_limit: 10
1105
+ report_to: wandb
1106
+ seed: 42
1107
+ do_eval: false
1108
+ gradient_checkpointing: false
1109
+ ddp_find_unused_parameters: false
1110
+ ddp_bucket_cap_mb: 100
1111
+ ray_num_workers: ???
1112
+ eval_bf16: true
1113
+ torch_compile_mode: null
1114
+ pretrained_model_path: /hfm/boqian/liboqian_data/checkpoints/DreamZero-AgiBot
1115
+ only_tune_projectors: false
1116
+ save_llm: false
1117
+ save_lora_only: false
1118
+ save_value_model: false
1119
+ save_q_model: false
1120
+ download_cache: false
1121
+ training_args:
1122
+ _target_: transformers.TrainingArguments
1123
+ output_dir: ./checkpoints/dreamzero_real_teleop_g1_full_finetune
1124
+ run_name: dreamzero_real_teleop_g1_full_finetune
1125
+ remove_unused_columns: false
1126
+ deepspeed: groot/vla/configs/deepspeed/zero2_offload.json
1127
+ gradient_checkpointing: false
1128
+ bf16: true
1129
+ tf32: true
1130
+ per_device_train_batch_size: 1
1131
+ per_device_eval_batch_size: 64
1132
+ gradient_accumulation_steps: 1
1133
+ dataloader_num_workers: 1
1134
+ dataloader_pin_memory: false
1135
+ dataloader_persistent_workers: true
1136
+ optim: adamw_torch
1137
+ adam_beta1: 0.95
1138
+ adam_beta2: 0.999
1139
+ adam_epsilon: 1.0e-08
1140
+ learning_rate: 1.0e-05
1141
+ weight_decay: 1.0e-05
1142
+ warmup_ratio: 0.05
1143
+ lr_scheduler_type: cosine
1144
+ logging_steps: 10.0
1145
+ num_train_epochs: 1000
1146
+ max_steps: 20000
1147
+ save_strategy: steps
1148
+ save_steps: 8000
1149
+ save_total_limit: 10
1150
+ report_to: wandb
1151
+ seed: 42
1152
+ do_eval: false
1153
+ ddp_find_unused_parameters: false
1154
+ ddp_bucket_cap_mb: 100
1155
+ torch_compile_mode: null
1156
+ profile_dir: null
1157
+ backbone_hidden_size: 0
1158
+ backbone_cfg:
1159
+ _target_: groot.vla.model.dreamzero.backbone.identity.IdentityBackbone
1160
+ action_head_cfg:
1161
+ config:
1162
+ backbone_features_projector_cfg: null
1163
+ _target_: groot.vla.model.dreamzero.action_head.wan_flow_matching_action_tf.WANPolicyHeadConfig
1164
+ _recursive_: false
1165
+ tiled: false
1166
+ tile_size_height: 34
1167
+ tile_size_width: 34
1168
+ tile_stride_height: 18
1169
+ tile_stride_width: 16
1170
+ lora_rank: 4
1171
+ lora_alpha: 4
1172
+ num_frames: 33
1173
+ num_frame_per_block: 2
1174
+ lora_target_modules: q,k,v,o,ffn.0,ffn.2
1175
+ init_lora_weights: kaiming
1176
+ train_architecture: full
1177
+ use_gradient_checkpointing: true
1178
+ add_pos_embed: true
1179
+ model_dtype: float32
1180
+ max_state_dim: 64
1181
+ max_action_dim: 36
1182
+ action_loss_embodiment_ids:
1183
+ - 26
1184
+ - 17
1185
+ - 32
1186
+ hidden_size: 64
1187
+ input_embedding_dim: 1536
1188
+ backbone_embedding_dim: 0
1189
+ repa_layer: 8
1190
+ repa_coeff: 1.0
1191
+ load_pretrained_det_decode_layer_path: null
1192
+ freeze_decode_layer: false
1193
+ expand_batch: null
1194
+ use_vlln: true
1195
+ vl_self_attention_cfg:
1196
+ _target_: groot.vla.model.n1_5.modules.cross_attention_dit.SelfAttentionTransformer
1197
+ positional_embeddings: null
1198
+ num_layers: 4
1199
+ num_attention_heads: 24
1200
+ attention_head_dim: 64
1201
+ dropout: 0.2
1202
+ final_dropout: true
1203
+ diffusion_model_cfg:
1204
+ _target_: groot.vla.model.dreamzero.modules.wan_video_dit_action_casual_chunk.CausalWanModel
1205
+ _convert_: object
1206
+ diffusion_model_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P
1207
+ model_type: i2v
1208
+ frame_seqlen: 220
1209
+ dim: 5120
1210
+ in_dim: 36
1211
+ ffn_dim: 13824
1212
+ out_dim: 16
1213
+ freq_dim: 256
1214
+ eps: 1.0e-06
1215
+ num_heads: 40
1216
+ num_layers: 40
1217
+ max_chunk_size: 4
1218
+ num_frame_per_block: 2
1219
+ num_action_per_block: 24
1220
+ num_state_per_block: 1
1221
+ action_dim: 36
1222
+ text_encoder_cfg:
1223
+ _target_: groot.vla.model.dreamzero.modules.wan_video_text_encoder.WanTextEncoder
1224
+ _convert_: object
1225
+ text_encoder_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/models_t5_umt5-xxl-enc-bf16.pth
1226
+ image_encoder_cfg:
1227
+ _target_: groot.vla.model.dreamzero.modules.wan_video_image_encoder.WanImageEncoder
1228
+ _convert_: object
1229
+ image_encoder_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth
1230
+ vae_cfg:
1231
+ _target_: groot.vla.model.dreamzero.modules.wan_video_vae.WanVideoVAE
1232
+ _convert_: object
1233
+ vae_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/Wan2.1_VAE.pth
1234
+ action_dim: 36
1235
+ action_horizon: 24
1236
+ num_inference_timesteps: 4
1237
+ noise_beta_alpha: 1.5
1238
+ noise_beta_beta: 1.0
1239
+ noise_s: 0.999
1240
+ num_timestep_buckets: 1000
1241
+ decouple_video_action_noise: false
1242
+ video_noise_beta_alpha: 3.0
1243
+ video_noise_beta_beta: 1.0
1244
+ tune_projector: true
1245
+ tune_diffusion_model: true
1246
+ skip_component_loading: true
1247
+ _target_: groot.vla.model.dreamzero.action_head.wan_flow_matching_action_tf.WANPolicyHead
1248
+ _convert_: object
1249
+ add_pos_embed: true
1250
+ hidden_size: 64
1251
+ attn_dropout: 0.2
1252
+ repa_layer: 8
1253
+ repa_coeff: 1.0
1254
+ load_pretrained_det_decode_layer_path: null
1255
+ expand_batch: null
1256
+ dit_version: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P
1257
+ text_encoder_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/models_t5_umt5-xxl-enc-bf16.pth
1258
+ image_encoder_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth
1259
+ vae_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/Wan2.1_VAE.pth
1260
+ train_architecture: full
1261
+ num_frame_per_block: 2
1262
+ num_action_per_block: 24
1263
+ num_state_per_block: 1
1264
+ frame_seqlen: 220
1265
+ embodiment_tag_to_projector_index:
1266
+ real_gr1_arms_only: 0
1267
+ real_gr1_arms_only_annotated: 1
1268
+ real_gr1_arms_waist: 2
1269
+ real_gr1_arms_waist_annotated: 3
1270
+ dexmg_gr1_arms_only_inspire: 4
1271
+ dexmg_gr1_arms_only_fourier: 5
1272
+ dexmg_gr1_arms_waist_fourier: 6
1273
+ robocasa_single_arm: 7
1274
+ onex_eve_gripper: 8
1275
+ robocasa_gr1_arms_only_inspire_hands: 9
1276
+ robocasa_gr1_arms_only_fourier_hands: 10
1277
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
1278
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
1279
+ robocasa_panda_omron: 13
1280
+ robocasa_bimanual_panda_parallel_gripper: 15
1281
+ robocasa_bimanual_panda_inspire_hand: 16
1282
+ oxe_droid: 17
1283
+ oxe_fractal: 18
1284
+ oxe_language_table: 19
1285
+ oxe_bridge: 20
1286
+ real_panda_single_arm: 21
1287
+ hot3d_hands_only: 23
1288
+ gr1_unified: 24
1289
+ robocasa_gr1_arms_waist_fourier_hands: 25
1290
+ agibot: 26
1291
+ lapa: 27
1292
+ oxe_mutex: 28
1293
+ oxe_roboset: 29
1294
+ oxe_plex: 30
1295
+ dream: 31
1296
+ yam: 32
1297
+ xdof: 22
1298
+ gr1_unified_segmentation: 14
1299
+ language_table_sim: 7
1300
+ gr1_isaac: 0
1301
+ sim_behavior_r1_pro: 31
1302
+ mecka_hands: 27
1303
+ real_r1_pro_sharpa: 28
1304
+ real_teleop_g1: 33
1305
+ max_length: 512
1306
+ num_views: 1
1307
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
1308
+ data_collator:
1309
+ _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DefaultDataCollator
1310
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
1311
+ max_length: 512
1312
+ num_views: 1
1313
+ embodiment_tag_mapping:
1314
+ real_gr1_arms_only: 0
1315
+ real_gr1_arms_only_annotated: 1
1316
+ real_gr1_arms_waist: 2
1317
+ real_gr1_arms_waist_annotated: 3
1318
+ dexmg_gr1_arms_only_inspire: 4
1319
+ dexmg_gr1_arms_only_fourier: 5
1320
+ dexmg_gr1_arms_waist_fourier: 6
1321
+ robocasa_single_arm: 7
1322
+ onex_eve_gripper: 8
1323
+ robocasa_gr1_arms_only_inspire_hands: 9
1324
+ robocasa_gr1_arms_only_fourier_hands: 10
1325
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
1326
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
1327
+ robocasa_panda_omron: 13
1328
+ robocasa_bimanual_panda_parallel_gripper: 15
1329
+ robocasa_bimanual_panda_inspire_hand: 16
1330
+ oxe_droid: 17
1331
+ oxe_fractal: 18
1332
+ oxe_language_table: 19
1333
+ oxe_bridge: 20
1334
+ real_panda_single_arm: 21
1335
+ hot3d_hands_only: 23
1336
+ gr1_unified: 24
1337
+ robocasa_gr1_arms_waist_fourier_hands: 25
1338
+ agibot: 26
1339
+ lapa: 27
1340
+ oxe_mutex: 28
1341
+ oxe_roboset: 29
1342
+ oxe_plex: 30
1343
+ dream: 31
1344
+ yam: 32
1345
+ xdof: 22
1346
+ gr1_unified_segmentation: 14
1347
+ language_table_sim: 7
1348
+ gr1_isaac: 0
1349
+ sim_behavior_r1_pro: 31
1350
+ mecka_hands: 27
1351
+ real_r1_pro_sharpa: 28
1352
+ real_teleop_g1: 33
1353
+ num_visual_tokens_per_frame: 16
1354
+ max_state_dim: 64
1355
+ max_action_dim: 36
1356
+ language_dropout_prob: 0.0
1357
+ model_specific_transform:
1358
+ _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
1359
+ default_instruction: Perform the default behavior.
1360
+ language_dropout_prob: 0.0
1361
+ always_use_default_instruction: false
1362
+ max_state_dim: 64
1363
+ max_action_dim: 36
1364
+ max_length: 512
1365
+ state_horizon: 1
1366
+ action_horizon: 24
1367
+ embodiment_tag_mapping:
1368
+ real_gr1_arms_only: 0
1369
+ real_gr1_arms_only_annotated: 1
1370
+ real_gr1_arms_waist: 2
1371
+ real_gr1_arms_waist_annotated: 3
1372
+ dexmg_gr1_arms_only_inspire: 4
1373
+ dexmg_gr1_arms_only_fourier: 5
1374
+ dexmg_gr1_arms_waist_fourier: 6
1375
+ robocasa_single_arm: 7
1376
+ onex_eve_gripper: 8
1377
+ robocasa_gr1_arms_only_inspire_hands: 9
1378
+ robocasa_gr1_arms_only_fourier_hands: 10
1379
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
1380
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
1381
+ robocasa_panda_omron: 13
1382
+ robocasa_bimanual_panda_parallel_gripper: 15
1383
+ robocasa_bimanual_panda_inspire_hand: 16
1384
+ oxe_droid: 17
1385
+ oxe_fractal: 18
1386
+ oxe_language_table: 19
1387
+ oxe_bridge: 20
1388
+ real_panda_single_arm: 21
1389
+ hot3d_hands_only: 23
1390
+ gr1_unified: 24
1391
+ robocasa_gr1_arms_waist_fourier_hands: 25
1392
+ agibot: 26
1393
+ lapa: 27
1394
+ oxe_mutex: 28
1395
+ oxe_roboset: 29
1396
+ oxe_plex: 30
1397
+ dream: 31
1398
+ yam: 32
1399
+ xdof: 22
1400
+ gr1_unified_segmentation: 14
1401
+ language_table_sim: 7
1402
+ gr1_isaac: 0
1403
+ sim_behavior_r1_pro: 31
1404
+ mecka_hands: 27
1405
+ real_r1_pro_sharpa: 28
1406
+ real_teleop_g1: 33
1407
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
1408
+ use_global_metadata: false
1409
+ num_frames: 33
1410
+ action_horizon: 24
1411
+ state_horizon: 1
1412
+ image_resolution_width: 320
1413
+ image_resolution_height: 176
1414
+ image_resolution_width_single_frame: 256
1415
+ image_resolution_height_single_frame: 256
1416
+ totensor_cfg:
1417
+ _target_: groot.vla.data.transform.VideoToTensor
1418
+ apply_to: ???
1419
+ crop_cfg:
1420
+ _target_: groot.vla.data.transform.VideoCrop
1421
+ apply_to: ???
1422
+ scale: 0.95
1423
+ mode: random
1424
+ resize_cfg:
1425
+ _target_: groot.vla.data.transform.VideoResize
1426
+ apply_to: ???
1427
+ height: 176
1428
+ width: 320
1429
+ interpolation: linear
1430
+ resize_cfg_single_frame:
1431
+ _target_: groot.vla.data.transform.VideoResize
1432
+ apply_to: ???
1433
+ height: 256
1434
+ width: 256
1435
+ interpolation: linear
1436
+ color_jitter_cfg:
1437
+ _target_: groot.vla.data.transform.VideoColorJitter
1438
+ apply_to: ???
1439
+ brightness: 0.3
1440
+ contrast: 0.4
1441
+ saturation: 0.5
1442
+ hue: 0.08
1443
+ random_grayscale_cfg:
1444
+ _target_: groot.vla.data.transform.VideoRandomGrayscale
1445
+ apply_to: ???
1446
+ p: 0.1
1447
+ random_posterize_cfg:
1448
+ _target_: groot.vla.data.transform.VideoRandomPosterize
1449
+ apply_to: ???
1450
+ bits: 4
1451
+ p: 0.1
1452
+ normalize_cfg:
1453
+ _target_: groot.vla.data.transform.VideoNormalize
1454
+ apply_to: ???
1455
+ mean:
1456
+ - 0.5
1457
+ - 0.5
1458
+ - 0.5
1459
+ std:
1460
+ - 0.5
1461
+ - 0.5
1462
+ - 0.5
1463
+ to_numpy_cfg:
1464
+ _target_: groot.vla.data.transform.VideoToNumpy
1465
+ apply_to: ???
1466
+ modality_config_oxe_droid:
1467
+ video:
1468
+ _target_: groot.vla.data.dataset.ModalityConfig
1469
+ delta_indices:
1470
+ - 0
1471
+ - 1
1472
+ - 2
1473
+ - 3
1474
+ - 4
1475
+ - 5
1476
+ - 6
1477
+ - 7
1478
+ - 8
1479
+ - 9
1480
+ - 10
1481
+ - 11
1482
+ - 12
1483
+ - 13
1484
+ - 14
1485
+ - 15
1486
+ - 16
1487
+ - 17
1488
+ - 18
1489
+ - 19
1490
+ - 20
1491
+ - 21
1492
+ - 22
1493
+ - 23
1494
+ - 24
1495
+ eval_delta_indices:
1496
+ - 0
1497
+ modality_keys:
1498
+ - video.exterior_image_1_left
1499
+ - video.exterior_image_2_left
1500
+ - video.wrist_image_left
1501
+ state:
1502
+ _target_: groot.vla.data.dataset.ModalityConfig
1503
+ delta_indices:
1504
+ - 0
1505
+ modality_keys:
1506
+ - state.joint_position
1507
+ - state.gripper_position
1508
+ action:
1509
+ _target_: groot.vla.data.dataset.ModalityConfig
1510
+ delta_indices:
1511
+ - 0
1512
+ - 1
1513
+ - 2
1514
+ - 3
1515
+ - 4
1516
+ - 5
1517
+ - 6
1518
+ - 7
1519
+ - 8
1520
+ - 9
1521
+ - 10
1522
+ - 11
1523
+ - 12
1524
+ - 13
1525
+ - 14
1526
+ - 15
1527
+ - 16
1528
+ - 17
1529
+ - 18
1530
+ - 19
1531
+ - 20
1532
+ - 21
1533
+ - 22
1534
+ - 23
1535
+ modality_keys:
1536
+ - action.joint_position
1537
+ - action.gripper_position
1538
+ language:
1539
+ _target_: groot.vla.data.dataset.ModalityConfig
1540
+ delta_indices:
1541
+ - 0
1542
+ modality_keys:
1543
+ - annotation.language.language_instruction
1544
+ - annotation.language.language_instruction_2
1545
+ - annotation.language.language_instruction_3
1546
+ lapa_action:
1547
+ _target_: groot.vla.data.dataset.ModalityConfig
1548
+ delta_indices:
1549
+ - 0
1550
+ modality_keys:
1551
+ - lapa_action
1552
+ transform_oxe_droid:
1553
+ _target_: groot.vla.data.transform.ComposedModalityTransform
1554
+ transforms:
1555
+ - _target_: groot.vla.data.transform.VideoToTensor
1556
+ apply_to:
1557
+ - video.exterior_image_1_left
1558
+ - video.exterior_image_2_left
1559
+ - video.wrist_image_left
1560
+ - _target_: groot.vla.data.transform.VideoCrop
1561
+ apply_to:
1562
+ - video.exterior_image_1_left
1563
+ - video.exterior_image_2_left
1564
+ - video.wrist_image_left
1565
+ scale: 0.95
1566
+ mode: random
1567
+ - _target_: groot.vla.data.transform.VideoResize
1568
+ apply_to:
1569
+ - video.exterior_image_1_left
1570
+ - video.exterior_image_2_left
1571
+ - video.wrist_image_left
1572
+ height: 176
1573
+ width: 320
1574
+ interpolation: linear
1575
+ - _target_: groot.vla.data.transform.VideoColorJitter
1576
+ apply_to:
1577
+ - video.exterior_image_1_left
1578
+ - video.exterior_image_2_left
1579
+ - video.wrist_image_left
1580
+ brightness: 0.3
1581
+ contrast: 0.4
1582
+ saturation: 0.5
1583
+ hue: 0.08
1584
+ - _target_: groot.vla.data.transform.VideoToNumpy
1585
+ apply_to:
1586
+ - video.exterior_image_1_left
1587
+ - video.exterior_image_2_left
1588
+ - video.wrist_image_left
1589
+ - _target_: groot.vla.data.transform.StateActionToTensor
1590
+ apply_to:
1591
+ - state.joint_position
1592
+ - state.gripper_position
1593
+ - _target_: groot.vla.data.transform.StateActionTransform
1594
+ apply_to:
1595
+ - state.joint_position
1596
+ - state.gripper_position
1597
+ normalization_modes:
1598
+ state.joint_position: q99
1599
+ state.gripper_position: q99
1600
+ - _target_: groot.vla.data.transform.StateActionToTensor
1601
+ apply_to:
1602
+ - action.joint_position
1603
+ - action.gripper_position
1604
+ - _target_: groot.vla.data.transform.StateActionTransform
1605
+ apply_to:
1606
+ - action.joint_position
1607
+ - action.gripper_position
1608
+ normalization_modes:
1609
+ action.joint_position: q99
1610
+ action.gripper_position: q99
1611
+ - _target_: groot.vla.data.transform.ConcatTransform
1612
+ video_concat_order:
1613
+ - video.exterior_image_1_left
1614
+ - video.exterior_image_2_left
1615
+ - video.wrist_image_left
1616
+ state_concat_order:
1617
+ - state.joint_position
1618
+ - state.gripper_position
1619
+ action_concat_order:
1620
+ - action.joint_position
1621
+ - action.gripper_position
1622
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
1623
+ default_instruction: Perform the default behavior.
1624
+ language_dropout_prob: 0.0
1625
+ always_use_default_instruction: false
1626
+ max_state_dim: 64
1627
+ max_action_dim: 36
1628
+ max_length: 512
1629
+ state_horizon: 1
1630
+ action_horizon: 24
1631
+ embodiment_tag_mapping:
1632
+ real_gr1_arms_only: 0
1633
+ real_gr1_arms_only_annotated: 1
1634
+ real_gr1_arms_waist: 2
1635
+ real_gr1_arms_waist_annotated: 3
1636
+ dexmg_gr1_arms_only_inspire: 4
1637
+ dexmg_gr1_arms_only_fourier: 5
1638
+ dexmg_gr1_arms_waist_fourier: 6
1639
+ robocasa_single_arm: 7
1640
+ onex_eve_gripper: 8
1641
+ robocasa_gr1_arms_only_inspire_hands: 9
1642
+ robocasa_gr1_arms_only_fourier_hands: 10
1643
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
1644
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
1645
+ robocasa_panda_omron: 13
1646
+ robocasa_bimanual_panda_parallel_gripper: 15
1647
+ robocasa_bimanual_panda_inspire_hand: 16
1648
+ oxe_droid: 17
1649
+ oxe_fractal: 18
1650
+ oxe_language_table: 19
1651
+ oxe_bridge: 20
1652
+ real_panda_single_arm: 21
1653
+ hot3d_hands_only: 23
1654
+ gr1_unified: 24
1655
+ robocasa_gr1_arms_waist_fourier_hands: 25
1656
+ agibot: 26
1657
+ lapa: 27
1658
+ oxe_mutex: 28
1659
+ oxe_roboset: 29
1660
+ oxe_plex: 30
1661
+ dream: 31
1662
+ yam: 32
1663
+ xdof: 22
1664
+ gr1_unified_segmentation: 14
1665
+ language_table_sim: 7
1666
+ gr1_isaac: 0
1667
+ sim_behavior_r1_pro: 31
1668
+ mecka_hands: 27
1669
+ real_r1_pro_sharpa: 28
1670
+ real_teleop_g1: 33
1671
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
1672
+ modality_config_agibot:
1673
+ video:
1674
+ _target_: groot.vla.data.dataset.ModalityConfig
1675
+ delta_indices:
1676
+ - 0
1677
+ - 1
1678
+ - 2
1679
+ - 3
1680
+ - 4
1681
+ - 5
1682
+ - 6
1683
+ - 7
1684
+ - 8
1685
+ - 9
1686
+ - 10
1687
+ - 11
1688
+ - 12
1689
+ - 13
1690
+ - 14
1691
+ - 15
1692
+ - 16
1693
+ - 17
1694
+ - 18
1695
+ - 19
1696
+ - 20
1697
+ - 21
1698
+ - 22
1699
+ - 23
1700
+ - 24
1701
+ eval_delta_indices:
1702
+ - -3
1703
+ - -2
1704
+ - -1
1705
+ - 0
1706
+ modality_keys:
1707
+ - video.top_head
1708
+ - video.hand_left
1709
+ - video.hand_right
1710
+ state:
1711
+ _target_: groot.vla.data.dataset.ModalityConfig
1712
+ delta_indices:
1713
+ - 0
1714
+ modality_keys:
1715
+ - state.left_arm_joint_position
1716
+ - state.right_arm_joint_position
1717
+ - state.left_effector_position
1718
+ - state.right_effector_position
1719
+ - state.head_position
1720
+ - state.waist_position
1721
+ action:
1722
+ _target_: groot.vla.data.dataset.ModalityConfig
1723
+ delta_indices:
1724
+ - 0
1725
+ - 1
1726
+ - 2
1727
+ - 3
1728
+ - 4
1729
+ - 5
1730
+ - 6
1731
+ - 7
1732
+ - 8
1733
+ - 9
1734
+ - 10
1735
+ - 11
1736
+ - 12
1737
+ - 13
1738
+ - 14
1739
+ - 15
1740
+ - 16
1741
+ - 17
1742
+ - 18
1743
+ - 19
1744
+ - 20
1745
+ - 21
1746
+ - 22
1747
+ - 23
1748
+ modality_keys:
1749
+ - action.left_arm_joint_position
1750
+ - action.right_arm_joint_position
1751
+ - action.left_effector_position
1752
+ - action.right_effector_position
1753
+ - action.head_position
1754
+ - action.waist_position
1755
+ - action.robot_velocity
1756
+ language:
1757
+ _target_: groot.vla.data.dataset.ModalityConfig
1758
+ delta_indices:
1759
+ - 0
1760
+ modality_keys:
1761
+ - annotation.language.action_text
1762
+ transform_agibot:
1763
+ _target_: groot.vla.data.transform.ComposedModalityTransform
1764
+ transforms:
1765
+ - _target_: groot.vla.data.transform.VideoToTensor
1766
+ apply_to:
1767
+ - video.top_head
1768
+ - video.hand_left
1769
+ - video.hand_right
1770
+ - _target_: groot.vla.data.transform.VideoCrop
1771
+ apply_to:
1772
+ - video.top_head
1773
+ - video.hand_left
1774
+ - video.hand_right
1775
+ scale: 0.95
1776
+ mode: random
1777
+ - _target_: groot.vla.data.transform.VideoResize
1778
+ apply_to:
1779
+ - video.top_head
1780
+ - video.hand_left
1781
+ - video.hand_right
1782
+ height: 176
1783
+ width: 320
1784
+ interpolation: linear
1785
+ - _target_: groot.vla.data.transform.VideoColorJitter
1786
+ apply_to:
1787
+ - video.top_head
1788
+ - video.hand_left
1789
+ - video.hand_right
1790
+ brightness: 0.3
1791
+ contrast: 0.4
1792
+ saturation: 0.5
1793
+ hue: 0.08
1794
+ - _target_: groot.vla.data.transform.VideoToNumpy
1795
+ apply_to:
1796
+ - video.top_head
1797
+ - video.hand_left
1798
+ - video.hand_right
1799
+ - _target_: groot.vla.data.transform.StateActionToTensor
1800
+ apply_to:
1801
+ - state.left_arm_joint_position
1802
+ - state.right_arm_joint_position
1803
+ - state.left_effector_position
1804
+ - state.right_effector_position
1805
+ - state.head_position
1806
+ - state.waist_position
1807
+ - _target_: groot.vla.data.transform.StateActionTransform
1808
+ apply_to:
1809
+ - state.left_arm_joint_position
1810
+ - state.right_arm_joint_position
1811
+ - state.left_effector_position
1812
+ - state.right_effector_position
1813
+ - state.head_position
1814
+ - state.waist_position
1815
+ normalization_modes:
1816
+ state.left_arm_joint_position: q99
1817
+ state.right_arm_joint_position: q99
1818
+ state.left_effector_position: q99
1819
+ state.right_effector_position: q99
1820
+ state.head_position: q99
1821
+ state.waist_position: q99
1822
+ - _target_: groot.vla.data.transform.StateActionToTensor
1823
+ apply_to:
1824
+ - action.left_arm_joint_position
1825
+ - action.right_arm_joint_position
1826
+ - action.left_effector_position
1827
+ - action.right_effector_position
1828
+ - action.head_position
1829
+ - action.waist_position
1830
+ - action.robot_velocity
1831
+ - _target_: groot.vla.data.transform.StateActionTransform
1832
+ apply_to:
1833
+ - action.left_arm_joint_position
1834
+ - action.right_arm_joint_position
1835
+ - action.left_effector_position
1836
+ - action.right_effector_position
1837
+ - action.head_position
1838
+ - action.waist_position
1839
+ - action.robot_velocity
1840
+ normalization_modes:
1841
+ action.left_arm_joint_position: q99
1842
+ action.right_arm_joint_position: q99
1843
+ action.left_effector_position: q99
1844
+ action.right_effector_position: q99
1845
+ action.head_position: q99
1846
+ action.waist_position: q99
1847
+ action.robot_velocity: q99
1848
+ - _target_: groot.vla.data.transform.ConcatTransform
1849
+ video_concat_order:
1850
+ - video.top_head
1851
+ - video.hand_left
1852
+ - video.hand_right
1853
+ state_concat_order:
1854
+ - state.left_arm_joint_position
1855
+ - state.right_arm_joint_position
1856
+ - state.left_effector_position
1857
+ - state.right_effector_position
1858
+ - state.head_position
1859
+ - state.waist_position
1860
+ action_concat_order:
1861
+ - action.left_arm_joint_position
1862
+ - action.right_arm_joint_position
1863
+ - action.left_effector_position
1864
+ - action.right_effector_position
1865
+ - action.head_position
1866
+ - action.waist_position
1867
+ - action.robot_velocity
1868
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
1869
+ default_instruction: Perform the default behavior.
1870
+ language_dropout_prob: 0.0
1871
+ always_use_default_instruction: false
1872
+ max_state_dim: 64
1873
+ max_action_dim: 36
1874
+ max_length: 512
1875
+ state_horizon: 1
1876
+ action_horizon: 24
1877
+ embodiment_tag_mapping:
1878
+ real_gr1_arms_only: 0
1879
+ real_gr1_arms_only_annotated: 1
1880
+ real_gr1_arms_waist: 2
1881
+ real_gr1_arms_waist_annotated: 3
1882
+ dexmg_gr1_arms_only_inspire: 4
1883
+ dexmg_gr1_arms_only_fourier: 5
1884
+ dexmg_gr1_arms_waist_fourier: 6
1885
+ robocasa_single_arm: 7
1886
+ onex_eve_gripper: 8
1887
+ robocasa_gr1_arms_only_inspire_hands: 9
1888
+ robocasa_gr1_arms_only_fourier_hands: 10
1889
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
1890
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
1891
+ robocasa_panda_omron: 13
1892
+ robocasa_bimanual_panda_parallel_gripper: 15
1893
+ robocasa_bimanual_panda_inspire_hand: 16
1894
+ oxe_droid: 17
1895
+ oxe_fractal: 18
1896
+ oxe_language_table: 19
1897
+ oxe_bridge: 20
1898
+ real_panda_single_arm: 21
1899
+ hot3d_hands_only: 23
1900
+ gr1_unified: 24
1901
+ robocasa_gr1_arms_waist_fourier_hands: 25
1902
+ agibot: 26
1903
+ lapa: 27
1904
+ oxe_mutex: 28
1905
+ oxe_roboset: 29
1906
+ oxe_plex: 30
1907
+ dream: 31
1908
+ yam: 32
1909
+ xdof: 22
1910
+ gr1_unified_segmentation: 14
1911
+ language_table_sim: 7
1912
+ gr1_isaac: 0
1913
+ sim_behavior_r1_pro: 31
1914
+ mecka_hands: 27
1915
+ real_r1_pro_sharpa: 28
1916
+ real_teleop_g1: 33
1917
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
1918
+ modality_config_yam:
1919
+ video:
1920
+ _target_: groot.vla.data.dataset.ModalityConfig
1921
+ delta_indices:
1922
+ - 0
1923
+ - 1
1924
+ - 2
1925
+ - 3
1926
+ - 4
1927
+ - 5
1928
+ - 6
1929
+ - 7
1930
+ - 8
1931
+ - 9
1932
+ - 10
1933
+ - 11
1934
+ - 12
1935
+ - 13
1936
+ - 14
1937
+ - 15
1938
+ - 16
1939
+ - 17
1940
+ - 18
1941
+ - 19
1942
+ - 20
1943
+ - 21
1944
+ - 22
1945
+ - 23
1946
+ - 24
1947
+ eval_delta_indices:
1948
+ - 0
1949
+ modality_keys:
1950
+ - video.top_camera-images-rgb
1951
+ - video.left_camera-images-rgb
1952
+ - video.right_camera-images-rgb
1953
+ state:
1954
+ _target_: groot.vla.data.dataset.ModalityConfig
1955
+ delta_indices:
1956
+ - 0
1957
+ modality_keys:
1958
+ - state.left_joint_pos
1959
+ - state.left_gripper_pos
1960
+ - state.right_joint_pos
1961
+ - state.right_gripper_pos
1962
+ action:
1963
+ _target_: groot.vla.data.dataset.ModalityConfig
1964
+ delta_indices:
1965
+ - 0
1966
+ - 1
1967
+ - 2
1968
+ - 3
1969
+ - 4
1970
+ - 5
1971
+ - 6
1972
+ - 7
1973
+ - 8
1974
+ - 9
1975
+ - 10
1976
+ - 11
1977
+ - 12
1978
+ - 13
1979
+ - 14
1980
+ - 15
1981
+ - 16
1982
+ - 17
1983
+ - 18
1984
+ - 19
1985
+ - 20
1986
+ - 21
1987
+ - 22
1988
+ - 23
1989
+ modality_keys:
1990
+ - action.left_joint_pos
1991
+ - action.left_gripper_pos
1992
+ - action.right_joint_pos
1993
+ - action.right_gripper_pos
1994
+ language:
1995
+ _target_: groot.vla.data.dataset.ModalityConfig
1996
+ delta_indices:
1997
+ - 0
1998
+ modality_keys:
1999
+ - annotation.task
2000
+ transform_yam:
2001
+ _target_: groot.vla.data.transform.ComposedModalityTransform
2002
+ transforms:
2003
+ - _target_: groot.vla.data.transform.VideoToTensor
2004
+ apply_to:
2005
+ - video.top_camera-images-rgb
2006
+ - video.left_camera-images-rgb
2007
+ - video.right_camera-images-rgb
2008
+ - _target_: groot.vla.data.transform.VideoCrop
2009
+ apply_to:
2010
+ - video.top_camera-images-rgb
2011
+ - video.left_camera-images-rgb
2012
+ - video.right_camera-images-rgb
2013
+ scale: 0.95
2014
+ mode: random
2015
+ - _target_: groot.vla.data.transform.VideoResize
2016
+ apply_to:
2017
+ - video.top_camera-images-rgb
2018
+ - video.left_camera-images-rgb
2019
+ - video.right_camera-images-rgb
2020
+ height: 176
2021
+ width: 320
2022
+ interpolation: linear
2023
+ - _target_: groot.vla.data.transform.VideoColorJitter
2024
+ apply_to:
2025
+ - video.top_camera-images-rgb
2026
+ - video.left_camera-images-rgb
2027
+ - video.right_camera-images-rgb
2028
+ brightness: 0.3
2029
+ contrast: 0.4
2030
+ saturation: 0.5
2031
+ hue: 0.08
2032
+ - _target_: groot.vla.data.transform.VideoToNumpy
2033
+ apply_to:
2034
+ - video.top_camera-images-rgb
2035
+ - video.left_camera-images-rgb
2036
+ - video.right_camera-images-rgb
2037
+ - _target_: groot.vla.data.transform.StateActionToTensor
2038
+ apply_to:
2039
+ - state.left_joint_pos
2040
+ - state.left_gripper_pos
2041
+ - state.right_joint_pos
2042
+ - state.right_gripper_pos
2043
+ - _target_: groot.vla.data.transform.StateActionTransform
2044
+ apply_to:
2045
+ - state.left_joint_pos
2046
+ - state.left_gripper_pos
2047
+ - state.right_joint_pos
2048
+ - state.right_gripper_pos
2049
+ normalization_modes:
2050
+ state.left_joint_pos: q99
2051
+ state.left_gripper_pos: q99
2052
+ state.right_joint_pos: q99
2053
+ state.right_gripper_pos: q99
2054
+ - _target_: groot.vla.data.transform.StateActionToTensor
2055
+ apply_to:
2056
+ - action.left_joint_pos
2057
+ - action.left_gripper_pos
2058
+ - action.right_joint_pos
2059
+ - action.right_gripper_pos
2060
+ - _target_: groot.vla.data.transform.StateActionTransform
2061
+ apply_to:
2062
+ - action.left_joint_pos
2063
+ - action.left_gripper_pos
2064
+ - action.right_joint_pos
2065
+ - action.right_gripper_pos
2066
+ normalization_modes:
2067
+ action.left_joint_pos: q99
2068
+ action.left_gripper_pos: q99
2069
+ action.right_joint_pos: q99
2070
+ action.right_gripper_pos: q99
2071
+ - _target_: groot.vla.data.transform.ConcatTransform
2072
+ video_concat_order:
2073
+ - video.top_camera-images-rgb
2074
+ - video.left_camera-images-rgb
2075
+ - video.right_camera-images-rgb
2076
+ state_concat_order:
2077
+ - state.left_joint_pos
2078
+ - state.left_gripper_pos
2079
+ - state.right_joint_pos
2080
+ - state.right_gripper_pos
2081
+ action_concat_order:
2082
+ - action.left_joint_pos
2083
+ - action.left_gripper_pos
2084
+ - action.right_joint_pos
2085
+ - action.right_gripper_pos
2086
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
2087
+ default_instruction: Perform the default behavior.
2088
+ language_dropout_prob: 0.0
2089
+ always_use_default_instruction: false
2090
+ max_state_dim: 64
2091
+ max_action_dim: 36
2092
+ max_length: 512
2093
+ state_horizon: 1
2094
+ action_horizon: 24
2095
+ embodiment_tag_mapping:
2096
+ real_gr1_arms_only: 0
2097
+ real_gr1_arms_only_annotated: 1
2098
+ real_gr1_arms_waist: 2
2099
+ real_gr1_arms_waist_annotated: 3
2100
+ dexmg_gr1_arms_only_inspire: 4
2101
+ dexmg_gr1_arms_only_fourier: 5
2102
+ dexmg_gr1_arms_waist_fourier: 6
2103
+ robocasa_single_arm: 7
2104
+ onex_eve_gripper: 8
2105
+ robocasa_gr1_arms_only_inspire_hands: 9
2106
+ robocasa_gr1_arms_only_fourier_hands: 10
2107
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
2108
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
2109
+ robocasa_panda_omron: 13
2110
+ robocasa_bimanual_panda_parallel_gripper: 15
2111
+ robocasa_bimanual_panda_inspire_hand: 16
2112
+ oxe_droid: 17
2113
+ oxe_fractal: 18
2114
+ oxe_language_table: 19
2115
+ oxe_bridge: 20
2116
+ real_panda_single_arm: 21
2117
+ hot3d_hands_only: 23
2118
+ gr1_unified: 24
2119
+ robocasa_gr1_arms_waist_fourier_hands: 25
2120
+ agibot: 26
2121
+ lapa: 27
2122
+ oxe_mutex: 28
2123
+ oxe_roboset: 29
2124
+ oxe_plex: 30
2125
+ dream: 31
2126
+ yam: 32
2127
+ xdof: 22
2128
+ gr1_unified_segmentation: 14
2129
+ language_table_sim: 7
2130
+ gr1_isaac: 0
2131
+ sim_behavior_r1_pro: 31
2132
+ mecka_hands: 27
2133
+ real_r1_pro_sharpa: 28
2134
+ real_teleop_g1: 33
2135
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
2136
+ modality_config_real_teleop_g1:
2137
+ video:
2138
+ _target_: groot.vla.data.dataset.ModalityConfig
2139
+ delta_indices:
2140
+ - 0
2141
+ - 1
2142
+ - 2
2143
+ - 3
2144
+ - 4
2145
+ - 5
2146
+ - 6
2147
+ - 7
2148
+ - 8
2149
+ - 9
2150
+ - 10
2151
+ - 11
2152
+ - 12
2153
+ - 13
2154
+ - 14
2155
+ - 15
2156
+ - 16
2157
+ - 17
2158
+ - 18
2159
+ - 19
2160
+ - 20
2161
+ - 21
2162
+ - 22
2163
+ - 23
2164
+ - 24
2165
+ eval_delta_indices:
2166
+ - 0
2167
+ modality_keys:
2168
+ - video.egocentric
2169
+ state:
2170
+ _target_: groot.vla.data.dataset.ModalityConfig
2171
+ delta_indices:
2172
+ - 0
2173
+ modality_keys:
2174
+ - state.left_hand
2175
+ - state.right_hand
2176
+ - state.left_arm
2177
+ - state.right_arm
2178
+ - state.rpy
2179
+ - state.height
2180
+ action:
2181
+ _target_: groot.vla.data.dataset.ModalityConfig
2182
+ delta_indices:
2183
+ - 0
2184
+ - 1
2185
+ - 2
2186
+ - 3
2187
+ - 4
2188
+ - 5
2189
+ - 6
2190
+ - 7
2191
+ - 8
2192
+ - 9
2193
+ - 10
2194
+ - 11
2195
+ - 12
2196
+ - 13
2197
+ - 14
2198
+ - 15
2199
+ - 16
2200
+ - 17
2201
+ - 18
2202
+ - 19
2203
+ - 20
2204
+ - 21
2205
+ - 22
2206
+ - 23
2207
+ modality_keys:
2208
+ - action.left_hand
2209
+ - action.right_hand
2210
+ - action.left_arm
2211
+ - action.right_arm
2212
+ - action.rpy
2213
+ - action.height
2214
+ - action.torso_vx
2215
+ - action.torso_vy
2216
+ - action.torso_vyaw
2217
+ - action.torso_dyaw
2218
+ language:
2219
+ _target_: groot.vla.data.dataset.ModalityConfig
2220
+ delta_indices:
2221
+ - 0
2222
+ modality_keys:
2223
+ - annotation.language.language_instruction
2224
+ transform_real_teleop_g1:
2225
+ _target_: groot.vla.data.transform.ComposedModalityTransform
2226
+ transforms:
2227
+ - _target_: groot.vla.data.transform.VideoToTensor
2228
+ apply_to:
2229
+ - video.egocentric
2230
+ - _target_: groot.vla.data.transform.VideoCrop
2231
+ apply_to:
2232
+ - video.egocentric
2233
+ scale: 0.95
2234
+ mode: random
2235
+ - _target_: groot.vla.data.transform.VideoResize
2236
+ apply_to:
2237
+ - video.egocentric
2238
+ height: 176
2239
+ width: 320
2240
+ interpolation: linear
2241
+ - _target_: groot.vla.data.transform.VideoColorJitter
2242
+ apply_to:
2243
+ - video.egocentric
2244
+ brightness: 0.3
2245
+ contrast: 0.4
2246
+ saturation: 0.5
2247
+ hue: 0.08
2248
+ - _target_: groot.vla.data.transform.VideoToNumpy
2249
+ apply_to:
2250
+ - video.egocentric
2251
+ - _target_: groot.vla.data.transform.StateActionToTensor
2252
+ apply_to:
2253
+ - state.left_hand
2254
+ - state.right_hand
2255
+ - state.left_arm
2256
+ - state.right_arm
2257
+ - state.rpy
2258
+ - state.height
2259
+ - _target_: groot.vla.data.transform.StateActionTransform
2260
+ apply_to:
2261
+ - state.left_hand
2262
+ - state.right_hand
2263
+ - state.left_arm
2264
+ - state.right_arm
2265
+ - state.rpy
2266
+ - state.height
2267
+ normalization_modes:
2268
+ state.left_hand: q99
2269
+ state.right_hand: q99
2270
+ state.left_arm: q99
2271
+ state.right_arm: q99
2272
+ state.rpy: q99
2273
+ state.height: q99
2274
+ - _target_: groot.vla.data.transform.StateActionToTensor
2275
+ apply_to:
2276
+ - action.left_hand
2277
+ - action.right_hand
2278
+ - action.left_arm
2279
+ - action.right_arm
2280
+ - action.rpy
2281
+ - action.height
2282
+ - action.torso_vx
2283
+ - action.torso_vy
2284
+ - action.torso_vyaw
2285
+ - action.torso_dyaw
2286
+ - _target_: groot.vla.data.transform.StateActionTransform
2287
+ apply_to:
2288
+ - action.left_hand
2289
+ - action.right_hand
2290
+ - action.left_arm
2291
+ - action.right_arm
2292
+ - action.rpy
2293
+ - action.height
2294
+ - action.torso_vx
2295
+ - action.torso_vy
2296
+ - action.torso_vyaw
2297
+ - action.torso_dyaw
2298
+ normalization_modes:
2299
+ action.left_hand: q99
2300
+ action.right_hand: q99
2301
+ action.left_arm: q99
2302
+ action.right_arm: q99
2303
+ action.rpy: q99
2304
+ action.height: q99
2305
+ action.torso_vx: q99
2306
+ action.torso_vy: q99
2307
+ action.torso_vyaw: q99
2308
+ action.torso_dyaw: q99
2309
+ - _target_: groot.vla.data.transform.ConcatTransform
2310
+ video_concat_order:
2311
+ - video.egocentric
2312
+ state_concat_order:
2313
+ - state.left_hand
2314
+ - state.right_hand
2315
+ - state.left_arm
2316
+ - state.right_arm
2317
+ - state.rpy
2318
+ - state.height
2319
+ action_concat_order:
2320
+ - action.left_hand
2321
+ - action.right_hand
2322
+ - action.left_arm
2323
+ - action.right_arm
2324
+ - action.rpy
2325
+ - action.height
2326
+ - action.torso_vx
2327
+ - action.torso_vy
2328
+ - action.torso_vyaw
2329
+ - action.torso_dyaw
2330
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
2331
+ default_instruction: Perform the default behavior.
2332
+ language_dropout_prob: 0.0
2333
+ always_use_default_instruction: false
2334
+ max_state_dim: 64
2335
+ max_action_dim: 36
2336
+ max_length: 512
2337
+ state_horizon: 1
2338
+ action_horizon: 24
2339
+ embodiment_tag_mapping:
2340
+ real_gr1_arms_only: 0
2341
+ real_gr1_arms_only_annotated: 1
2342
+ real_gr1_arms_waist: 2
2343
+ real_gr1_arms_waist_annotated: 3
2344
+ dexmg_gr1_arms_only_inspire: 4
2345
+ dexmg_gr1_arms_only_fourier: 5
2346
+ dexmg_gr1_arms_waist_fourier: 6
2347
+ robocasa_single_arm: 7
2348
+ onex_eve_gripper: 8
2349
+ robocasa_gr1_arms_only_inspire_hands: 9
2350
+ robocasa_gr1_arms_only_fourier_hands: 10
2351
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
2352
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
2353
+ robocasa_panda_omron: 13
2354
+ robocasa_bimanual_panda_parallel_gripper: 15
2355
+ robocasa_bimanual_panda_inspire_hand: 16
2356
+ oxe_droid: 17
2357
+ oxe_fractal: 18
2358
+ oxe_language_table: 19
2359
+ oxe_bridge: 20
2360
+ real_panda_single_arm: 21
2361
+ hot3d_hands_only: 23
2362
+ gr1_unified: 24
2363
+ robocasa_gr1_arms_waist_fourier_hands: 25
2364
+ agibot: 26
2365
+ lapa: 27
2366
+ oxe_mutex: 28
2367
+ oxe_roboset: 29
2368
+ oxe_plex: 30
2369
+ dream: 31
2370
+ yam: 32
2371
+ xdof: 22
2372
+ gr1_unified_segmentation: 14
2373
+ language_table_sim: 7
2374
+ gr1_isaac: 0
2375
+ sim_behavior_r1_pro: 31
2376
+ mecka_hands: 27
2377
+ real_r1_pro_sharpa: 28
2378
+ real_teleop_g1: 33
2379
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
2380
+ modality_configs:
2381
+ oxe_droid:
2382
+ video:
2383
+ _target_: groot.vla.data.dataset.ModalityConfig
2384
+ delta_indices:
2385
+ - 0
2386
+ - 1
2387
+ - 2
2388
+ - 3
2389
+ - 4
2390
+ - 5
2391
+ - 6
2392
+ - 7
2393
+ - 8
2394
+ - 9
2395
+ - 10
2396
+ - 11
2397
+ - 12
2398
+ - 13
2399
+ - 14
2400
+ - 15
2401
+ - 16
2402
+ - 17
2403
+ - 18
2404
+ - 19
2405
+ - 20
2406
+ - 21
2407
+ - 22
2408
+ - 23
2409
+ - 24
2410
+ eval_delta_indices:
2411
+ - 0
2412
+ modality_keys:
2413
+ - video.exterior_image_1_left
2414
+ - video.exterior_image_2_left
2415
+ - video.wrist_image_left
2416
+ state:
2417
+ _target_: groot.vla.data.dataset.ModalityConfig
2418
+ delta_indices:
2419
+ - 0
2420
+ modality_keys:
2421
+ - state.joint_position
2422
+ - state.gripper_position
2423
+ action:
2424
+ _target_: groot.vla.data.dataset.ModalityConfig
2425
+ delta_indices:
2426
+ - 0
2427
+ - 1
2428
+ - 2
2429
+ - 3
2430
+ - 4
2431
+ - 5
2432
+ - 6
2433
+ - 7
2434
+ - 8
2435
+ - 9
2436
+ - 10
2437
+ - 11
2438
+ - 12
2439
+ - 13
2440
+ - 14
2441
+ - 15
2442
+ - 16
2443
+ - 17
2444
+ - 18
2445
+ - 19
2446
+ - 20
2447
+ - 21
2448
+ - 22
2449
+ - 23
2450
+ modality_keys:
2451
+ - action.joint_position
2452
+ - action.gripper_position
2453
+ language:
2454
+ _target_: groot.vla.data.dataset.ModalityConfig
2455
+ delta_indices:
2456
+ - 0
2457
+ modality_keys:
2458
+ - annotation.language.language_instruction
2459
+ - annotation.language.language_instruction_2
2460
+ - annotation.language.language_instruction_3
2461
+ lapa_action:
2462
+ _target_: groot.vla.data.dataset.ModalityConfig
2463
+ delta_indices:
2464
+ - 0
2465
+ modality_keys:
2466
+ - lapa_action
2467
+ agibot:
2468
+ video:
2469
+ _target_: groot.vla.data.dataset.ModalityConfig
2470
+ delta_indices:
2471
+ - 0
2472
+ - 1
2473
+ - 2
2474
+ - 3
2475
+ - 4
2476
+ - 5
2477
+ - 6
2478
+ - 7
2479
+ - 8
2480
+ - 9
2481
+ - 10
2482
+ - 11
2483
+ - 12
2484
+ - 13
2485
+ - 14
2486
+ - 15
2487
+ - 16
2488
+ - 17
2489
+ - 18
2490
+ - 19
2491
+ - 20
2492
+ - 21
2493
+ - 22
2494
+ - 23
2495
+ - 24
2496
+ eval_delta_indices:
2497
+ - -3
2498
+ - -2
2499
+ - -1
2500
+ - 0
2501
+ modality_keys:
2502
+ - video.top_head
2503
+ - video.hand_left
2504
+ - video.hand_right
2505
+ state:
2506
+ _target_: groot.vla.data.dataset.ModalityConfig
2507
+ delta_indices:
2508
+ - 0
2509
+ modality_keys:
2510
+ - state.left_arm_joint_position
2511
+ - state.right_arm_joint_position
2512
+ - state.left_effector_position
2513
+ - state.right_effector_position
2514
+ - state.head_position
2515
+ - state.waist_position
2516
+ action:
2517
+ _target_: groot.vla.data.dataset.ModalityConfig
2518
+ delta_indices:
2519
+ - 0
2520
+ - 1
2521
+ - 2
2522
+ - 3
2523
+ - 4
2524
+ - 5
2525
+ - 6
2526
+ - 7
2527
+ - 8
2528
+ - 9
2529
+ - 10
2530
+ - 11
2531
+ - 12
2532
+ - 13
2533
+ - 14
2534
+ - 15
2535
+ - 16
2536
+ - 17
2537
+ - 18
2538
+ - 19
2539
+ - 20
2540
+ - 21
2541
+ - 22
2542
+ - 23
2543
+ modality_keys:
2544
+ - action.left_arm_joint_position
2545
+ - action.right_arm_joint_position
2546
+ - action.left_effector_position
2547
+ - action.right_effector_position
2548
+ - action.head_position
2549
+ - action.waist_position
2550
+ - action.robot_velocity
2551
+ language:
2552
+ _target_: groot.vla.data.dataset.ModalityConfig
2553
+ delta_indices:
2554
+ - 0
2555
+ modality_keys:
2556
+ - annotation.language.action_text
2557
+ yam:
2558
+ video:
2559
+ _target_: groot.vla.data.dataset.ModalityConfig
2560
+ delta_indices:
2561
+ - 0
2562
+ - 1
2563
+ - 2
2564
+ - 3
2565
+ - 4
2566
+ - 5
2567
+ - 6
2568
+ - 7
2569
+ - 8
2570
+ - 9
2571
+ - 10
2572
+ - 11
2573
+ - 12
2574
+ - 13
2575
+ - 14
2576
+ - 15
2577
+ - 16
2578
+ - 17
2579
+ - 18
2580
+ - 19
2581
+ - 20
2582
+ - 21
2583
+ - 22
2584
+ - 23
2585
+ - 24
2586
+ eval_delta_indices:
2587
+ - 0
2588
+ modality_keys:
2589
+ - video.top_camera-images-rgb
2590
+ - video.left_camera-images-rgb
2591
+ - video.right_camera-images-rgb
2592
+ state:
2593
+ _target_: groot.vla.data.dataset.ModalityConfig
2594
+ delta_indices:
2595
+ - 0
2596
+ modality_keys:
2597
+ - state.left_joint_pos
2598
+ - state.left_gripper_pos
2599
+ - state.right_joint_pos
2600
+ - state.right_gripper_pos
2601
+ action:
2602
+ _target_: groot.vla.data.dataset.ModalityConfig
2603
+ delta_indices:
2604
+ - 0
2605
+ - 1
2606
+ - 2
2607
+ - 3
2608
+ - 4
2609
+ - 5
2610
+ - 6
2611
+ - 7
2612
+ - 8
2613
+ - 9
2614
+ - 10
2615
+ - 11
2616
+ - 12
2617
+ - 13
2618
+ - 14
2619
+ - 15
2620
+ - 16
2621
+ - 17
2622
+ - 18
2623
+ - 19
2624
+ - 20
2625
+ - 21
2626
+ - 22
2627
+ - 23
2628
+ modality_keys:
2629
+ - action.left_joint_pos
2630
+ - action.left_gripper_pos
2631
+ - action.right_joint_pos
2632
+ - action.right_gripper_pos
2633
+ language:
2634
+ _target_: groot.vla.data.dataset.ModalityConfig
2635
+ delta_indices:
2636
+ - 0
2637
+ modality_keys:
2638
+ - annotation.task
2639
+ real_teleop_g1:
2640
+ video:
2641
+ _target_: groot.vla.data.dataset.ModalityConfig
2642
+ delta_indices:
2643
+ - 0
2644
+ - 1
2645
+ - 2
2646
+ - 3
2647
+ - 4
2648
+ - 5
2649
+ - 6
2650
+ - 7
2651
+ - 8
2652
+ - 9
2653
+ - 10
2654
+ - 11
2655
+ - 12
2656
+ - 13
2657
+ - 14
2658
+ - 15
2659
+ - 16
2660
+ - 17
2661
+ - 18
2662
+ - 19
2663
+ - 20
2664
+ - 21
2665
+ - 22
2666
+ - 23
2667
+ - 24
2668
+ eval_delta_indices:
2669
+ - 0
2670
+ modality_keys:
2671
+ - video.egocentric
2672
+ state:
2673
+ _target_: groot.vla.data.dataset.ModalityConfig
2674
+ delta_indices:
2675
+ - 0
2676
+ modality_keys:
2677
+ - state.left_hand
2678
+ - state.right_hand
2679
+ - state.left_arm
2680
+ - state.right_arm
2681
+ - state.rpy
2682
+ - state.height
2683
+ action:
2684
+ _target_: groot.vla.data.dataset.ModalityConfig
2685
+ delta_indices:
2686
+ - 0
2687
+ - 1
2688
+ - 2
2689
+ - 3
2690
+ - 4
2691
+ - 5
2692
+ - 6
2693
+ - 7
2694
+ - 8
2695
+ - 9
2696
+ - 10
2697
+ - 11
2698
+ - 12
2699
+ - 13
2700
+ - 14
2701
+ - 15
2702
+ - 16
2703
+ - 17
2704
+ - 18
2705
+ - 19
2706
+ - 20
2707
+ - 21
2708
+ - 22
2709
+ - 23
2710
+ modality_keys:
2711
+ - action.left_hand
2712
+ - action.right_hand
2713
+ - action.left_arm
2714
+ - action.right_arm
2715
+ - action.rpy
2716
+ - action.height
2717
+ - action.torso_vx
2718
+ - action.torso_vy
2719
+ - action.torso_vyaw
2720
+ - action.torso_dyaw
2721
+ language:
2722
+ _target_: groot.vla.data.dataset.ModalityConfig
2723
+ delta_indices:
2724
+ - 0
2725
+ modality_keys:
2726
+ - annotation.language.language_instruction
2727
+ transforms:
2728
+ oxe_droid:
2729
+ _target_: groot.vla.data.transform.ComposedModalityTransform
2730
+ transforms:
2731
+ - _target_: groot.vla.data.transform.VideoToTensor
2732
+ apply_to:
2733
+ - video.exterior_image_1_left
2734
+ - video.exterior_image_2_left
2735
+ - video.wrist_image_left
2736
+ - _target_: groot.vla.data.transform.VideoCrop
2737
+ apply_to:
2738
+ - video.exterior_image_1_left
2739
+ - video.exterior_image_2_left
2740
+ - video.wrist_image_left
2741
+ scale: 0.95
2742
+ mode: random
2743
+ - _target_: groot.vla.data.transform.VideoResize
2744
+ apply_to:
2745
+ - video.exterior_image_1_left
2746
+ - video.exterior_image_2_left
2747
+ - video.wrist_image_left
2748
+ height: 176
2749
+ width: 320
2750
+ interpolation: linear
2751
+ - _target_: groot.vla.data.transform.VideoColorJitter
2752
+ apply_to:
2753
+ - video.exterior_image_1_left
2754
+ - video.exterior_image_2_left
2755
+ - video.wrist_image_left
2756
+ brightness: 0.3
2757
+ contrast: 0.4
2758
+ saturation: 0.5
2759
+ hue: 0.08
2760
+ - _target_: groot.vla.data.transform.VideoToNumpy
2761
+ apply_to:
2762
+ - video.exterior_image_1_left
2763
+ - video.exterior_image_2_left
2764
+ - video.wrist_image_left
2765
+ - _target_: groot.vla.data.transform.StateActionToTensor
2766
+ apply_to:
2767
+ - state.joint_position
2768
+ - state.gripper_position
2769
+ - _target_: groot.vla.data.transform.StateActionTransform
2770
+ apply_to:
2771
+ - state.joint_position
2772
+ - state.gripper_position
2773
+ normalization_modes:
2774
+ state.joint_position: q99
2775
+ state.gripper_position: q99
2776
+ - _target_: groot.vla.data.transform.StateActionToTensor
2777
+ apply_to:
2778
+ - action.joint_position
2779
+ - action.gripper_position
2780
+ - _target_: groot.vla.data.transform.StateActionTransform
2781
+ apply_to:
2782
+ - action.joint_position
2783
+ - action.gripper_position
2784
+ normalization_modes:
2785
+ action.joint_position: q99
2786
+ action.gripper_position: q99
2787
+ - _target_: groot.vla.data.transform.ConcatTransform
2788
+ video_concat_order:
2789
+ - video.exterior_image_1_left
2790
+ - video.exterior_image_2_left
2791
+ - video.wrist_image_left
2792
+ state_concat_order:
2793
+ - state.joint_position
2794
+ - state.gripper_position
2795
+ action_concat_order:
2796
+ - action.joint_position
2797
+ - action.gripper_position
2798
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
2799
+ default_instruction: Perform the default behavior.
2800
+ language_dropout_prob: 0.0
2801
+ always_use_default_instruction: false
2802
+ max_state_dim: 64
2803
+ max_action_dim: 36
2804
+ max_length: 512
2805
+ state_horizon: 1
2806
+ action_horizon: 24
2807
+ embodiment_tag_mapping:
2808
+ real_gr1_arms_only: 0
2809
+ real_gr1_arms_only_annotated: 1
2810
+ real_gr1_arms_waist: 2
2811
+ real_gr1_arms_waist_annotated: 3
2812
+ dexmg_gr1_arms_only_inspire: 4
2813
+ dexmg_gr1_arms_only_fourier: 5
2814
+ dexmg_gr1_arms_waist_fourier: 6
2815
+ robocasa_single_arm: 7
2816
+ onex_eve_gripper: 8
2817
+ robocasa_gr1_arms_only_inspire_hands: 9
2818
+ robocasa_gr1_arms_only_fourier_hands: 10
2819
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
2820
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
2821
+ robocasa_panda_omron: 13
2822
+ robocasa_bimanual_panda_parallel_gripper: 15
2823
+ robocasa_bimanual_panda_inspire_hand: 16
2824
+ oxe_droid: 17
2825
+ oxe_fractal: 18
2826
+ oxe_language_table: 19
2827
+ oxe_bridge: 20
2828
+ real_panda_single_arm: 21
2829
+ hot3d_hands_only: 23
2830
+ gr1_unified: 24
2831
+ robocasa_gr1_arms_waist_fourier_hands: 25
2832
+ agibot: 26
2833
+ lapa: 27
2834
+ oxe_mutex: 28
2835
+ oxe_roboset: 29
2836
+ oxe_plex: 30
2837
+ dream: 31
2838
+ yam: 32
2839
+ xdof: 22
2840
+ gr1_unified_segmentation: 14
2841
+ language_table_sim: 7
2842
+ gr1_isaac: 0
2843
+ sim_behavior_r1_pro: 31
2844
+ mecka_hands: 27
2845
+ real_r1_pro_sharpa: 28
2846
+ real_teleop_g1: 33
2847
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
2848
+ agibot:
2849
+ _target_: groot.vla.data.transform.ComposedModalityTransform
2850
+ transforms:
2851
+ - _target_: groot.vla.data.transform.VideoToTensor
2852
+ apply_to:
2853
+ - video.top_head
2854
+ - video.hand_left
2855
+ - video.hand_right
2856
+ - _target_: groot.vla.data.transform.VideoCrop
2857
+ apply_to:
2858
+ - video.top_head
2859
+ - video.hand_left
2860
+ - video.hand_right
2861
+ scale: 0.95
2862
+ mode: random
2863
+ - _target_: groot.vla.data.transform.VideoResize
2864
+ apply_to:
2865
+ - video.top_head
2866
+ - video.hand_left
2867
+ - video.hand_right
2868
+ height: 176
2869
+ width: 320
2870
+ interpolation: linear
2871
+ - _target_: groot.vla.data.transform.VideoColorJitter
2872
+ apply_to:
2873
+ - video.top_head
2874
+ - video.hand_left
2875
+ - video.hand_right
2876
+ brightness: 0.3
2877
+ contrast: 0.4
2878
+ saturation: 0.5
2879
+ hue: 0.08
2880
+ - _target_: groot.vla.data.transform.VideoToNumpy
2881
+ apply_to:
2882
+ - video.top_head
2883
+ - video.hand_left
2884
+ - video.hand_right
2885
+ - _target_: groot.vla.data.transform.StateActionToTensor
2886
+ apply_to:
2887
+ - state.left_arm_joint_position
2888
+ - state.right_arm_joint_position
2889
+ - state.left_effector_position
2890
+ - state.right_effector_position
2891
+ - state.head_position
2892
+ - state.waist_position
2893
+ - _target_: groot.vla.data.transform.StateActionTransform
2894
+ apply_to:
2895
+ - state.left_arm_joint_position
2896
+ - state.right_arm_joint_position
2897
+ - state.left_effector_position
2898
+ - state.right_effector_position
2899
+ - state.head_position
2900
+ - state.waist_position
2901
+ normalization_modes:
2902
+ state.left_arm_joint_position: q99
2903
+ state.right_arm_joint_position: q99
2904
+ state.left_effector_position: q99
2905
+ state.right_effector_position: q99
2906
+ state.head_position: q99
2907
+ state.waist_position: q99
2908
+ - _target_: groot.vla.data.transform.StateActionToTensor
2909
+ apply_to:
2910
+ - action.left_arm_joint_position
2911
+ - action.right_arm_joint_position
2912
+ - action.left_effector_position
2913
+ - action.right_effector_position
2914
+ - action.head_position
2915
+ - action.waist_position
2916
+ - action.robot_velocity
2917
+ - _target_: groot.vla.data.transform.StateActionTransform
2918
+ apply_to:
2919
+ - action.left_arm_joint_position
2920
+ - action.right_arm_joint_position
2921
+ - action.left_effector_position
2922
+ - action.right_effector_position
2923
+ - action.head_position
2924
+ - action.waist_position
2925
+ - action.robot_velocity
2926
+ normalization_modes:
2927
+ action.left_arm_joint_position: q99
2928
+ action.right_arm_joint_position: q99
2929
+ action.left_effector_position: q99
2930
+ action.right_effector_position: q99
2931
+ action.head_position: q99
2932
+ action.waist_position: q99
2933
+ action.robot_velocity: q99
2934
+ - _target_: groot.vla.data.transform.ConcatTransform
2935
+ video_concat_order:
2936
+ - video.top_head
2937
+ - video.hand_left
2938
+ - video.hand_right
2939
+ state_concat_order:
2940
+ - state.left_arm_joint_position
2941
+ - state.right_arm_joint_position
2942
+ - state.left_effector_position
2943
+ - state.right_effector_position
2944
+ - state.head_position
2945
+ - state.waist_position
2946
+ action_concat_order:
2947
+ - action.left_arm_joint_position
2948
+ - action.right_arm_joint_position
2949
+ - action.left_effector_position
2950
+ - action.right_effector_position
2951
+ - action.head_position
2952
+ - action.waist_position
2953
+ - action.robot_velocity
2954
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
2955
+ default_instruction: Perform the default behavior.
2956
+ language_dropout_prob: 0.0
2957
+ always_use_default_instruction: false
2958
+ max_state_dim: 64
2959
+ max_action_dim: 36
2960
+ max_length: 512
2961
+ state_horizon: 1
2962
+ action_horizon: 24
2963
+ embodiment_tag_mapping:
2964
+ real_gr1_arms_only: 0
2965
+ real_gr1_arms_only_annotated: 1
2966
+ real_gr1_arms_waist: 2
2967
+ real_gr1_arms_waist_annotated: 3
2968
+ dexmg_gr1_arms_only_inspire: 4
2969
+ dexmg_gr1_arms_only_fourier: 5
2970
+ dexmg_gr1_arms_waist_fourier: 6
2971
+ robocasa_single_arm: 7
2972
+ onex_eve_gripper: 8
2973
+ robocasa_gr1_arms_only_inspire_hands: 9
2974
+ robocasa_gr1_arms_only_fourier_hands: 10
2975
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
2976
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
2977
+ robocasa_panda_omron: 13
2978
+ robocasa_bimanual_panda_parallel_gripper: 15
2979
+ robocasa_bimanual_panda_inspire_hand: 16
2980
+ oxe_droid: 17
2981
+ oxe_fractal: 18
2982
+ oxe_language_table: 19
2983
+ oxe_bridge: 20
2984
+ real_panda_single_arm: 21
2985
+ hot3d_hands_only: 23
2986
+ gr1_unified: 24
2987
+ robocasa_gr1_arms_waist_fourier_hands: 25
2988
+ agibot: 26
2989
+ lapa: 27
2990
+ oxe_mutex: 28
2991
+ oxe_roboset: 29
2992
+ oxe_plex: 30
2993
+ dream: 31
2994
+ yam: 32
2995
+ xdof: 22
2996
+ gr1_unified_segmentation: 14
2997
+ language_table_sim: 7
2998
+ gr1_isaac: 0
2999
+ sim_behavior_r1_pro: 31
3000
+ mecka_hands: 27
3001
+ real_r1_pro_sharpa: 28
3002
+ real_teleop_g1: 33
3003
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
3004
+ yam:
3005
+ _target_: groot.vla.data.transform.ComposedModalityTransform
3006
+ transforms:
3007
+ - _target_: groot.vla.data.transform.VideoToTensor
3008
+ apply_to:
3009
+ - video.top_camera-images-rgb
3010
+ - video.left_camera-images-rgb
3011
+ - video.right_camera-images-rgb
3012
+ - _target_: groot.vla.data.transform.VideoCrop
3013
+ apply_to:
3014
+ - video.top_camera-images-rgb
3015
+ - video.left_camera-images-rgb
3016
+ - video.right_camera-images-rgb
3017
+ scale: 0.95
3018
+ mode: random
3019
+ - _target_: groot.vla.data.transform.VideoResize
3020
+ apply_to:
3021
+ - video.top_camera-images-rgb
3022
+ - video.left_camera-images-rgb
3023
+ - video.right_camera-images-rgb
3024
+ height: 176
3025
+ width: 320
3026
+ interpolation: linear
3027
+ - _target_: groot.vla.data.transform.VideoColorJitter
3028
+ apply_to:
3029
+ - video.top_camera-images-rgb
3030
+ - video.left_camera-images-rgb
3031
+ - video.right_camera-images-rgb
3032
+ brightness: 0.3
3033
+ contrast: 0.4
3034
+ saturation: 0.5
3035
+ hue: 0.08
3036
+ - _target_: groot.vla.data.transform.VideoToNumpy
3037
+ apply_to:
3038
+ - video.top_camera-images-rgb
3039
+ - video.left_camera-images-rgb
3040
+ - video.right_camera-images-rgb
3041
+ - _target_: groot.vla.data.transform.StateActionToTensor
3042
+ apply_to:
3043
+ - state.left_joint_pos
3044
+ - state.left_gripper_pos
3045
+ - state.right_joint_pos
3046
+ - state.right_gripper_pos
3047
+ - _target_: groot.vla.data.transform.StateActionTransform
3048
+ apply_to:
3049
+ - state.left_joint_pos
3050
+ - state.left_gripper_pos
3051
+ - state.right_joint_pos
3052
+ - state.right_gripper_pos
3053
+ normalization_modes:
3054
+ state.left_joint_pos: q99
3055
+ state.left_gripper_pos: q99
3056
+ state.right_joint_pos: q99
3057
+ state.right_gripper_pos: q99
3058
+ - _target_: groot.vla.data.transform.StateActionToTensor
3059
+ apply_to:
3060
+ - action.left_joint_pos
3061
+ - action.left_gripper_pos
3062
+ - action.right_joint_pos
3063
+ - action.right_gripper_pos
3064
+ - _target_: groot.vla.data.transform.StateActionTransform
3065
+ apply_to:
3066
+ - action.left_joint_pos
3067
+ - action.left_gripper_pos
3068
+ - action.right_joint_pos
3069
+ - action.right_gripper_pos
3070
+ normalization_modes:
3071
+ action.left_joint_pos: q99
3072
+ action.left_gripper_pos: q99
3073
+ action.right_joint_pos: q99
3074
+ action.right_gripper_pos: q99
3075
+ - _target_: groot.vla.data.transform.ConcatTransform
3076
+ video_concat_order:
3077
+ - video.top_camera-images-rgb
3078
+ - video.left_camera-images-rgb
3079
+ - video.right_camera-images-rgb
3080
+ state_concat_order:
3081
+ - state.left_joint_pos
3082
+ - state.left_gripper_pos
3083
+ - state.right_joint_pos
3084
+ - state.right_gripper_pos
3085
+ action_concat_order:
3086
+ - action.left_joint_pos
3087
+ - action.left_gripper_pos
3088
+ - action.right_joint_pos
3089
+ - action.right_gripper_pos
3090
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
3091
+ default_instruction: Perform the default behavior.
3092
+ language_dropout_prob: 0.0
3093
+ always_use_default_instruction: false
3094
+ max_state_dim: 64
3095
+ max_action_dim: 36
3096
+ max_length: 512
3097
+ state_horizon: 1
3098
+ action_horizon: 24
3099
+ embodiment_tag_mapping:
3100
+ real_gr1_arms_only: 0
3101
+ real_gr1_arms_only_annotated: 1
3102
+ real_gr1_arms_waist: 2
3103
+ real_gr1_arms_waist_annotated: 3
3104
+ dexmg_gr1_arms_only_inspire: 4
3105
+ dexmg_gr1_arms_only_fourier: 5
3106
+ dexmg_gr1_arms_waist_fourier: 6
3107
+ robocasa_single_arm: 7
3108
+ onex_eve_gripper: 8
3109
+ robocasa_gr1_arms_only_inspire_hands: 9
3110
+ robocasa_gr1_arms_only_fourier_hands: 10
3111
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
3112
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
3113
+ robocasa_panda_omron: 13
3114
+ robocasa_bimanual_panda_parallel_gripper: 15
3115
+ robocasa_bimanual_panda_inspire_hand: 16
3116
+ oxe_droid: 17
3117
+ oxe_fractal: 18
3118
+ oxe_language_table: 19
3119
+ oxe_bridge: 20
3120
+ real_panda_single_arm: 21
3121
+ hot3d_hands_only: 23
3122
+ gr1_unified: 24
3123
+ robocasa_gr1_arms_waist_fourier_hands: 25
3124
+ agibot: 26
3125
+ lapa: 27
3126
+ oxe_mutex: 28
3127
+ oxe_roboset: 29
3128
+ oxe_plex: 30
3129
+ dream: 31
3130
+ yam: 32
3131
+ xdof: 22
3132
+ gr1_unified_segmentation: 14
3133
+ language_table_sim: 7
3134
+ gr1_isaac: 0
3135
+ sim_behavior_r1_pro: 31
3136
+ mecka_hands: 27
3137
+ real_r1_pro_sharpa: 28
3138
+ real_teleop_g1: 33
3139
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
3140
+ real_teleop_g1:
3141
+ _target_: groot.vla.data.transform.ComposedModalityTransform
3142
+ transforms:
3143
+ - _target_: groot.vla.data.transform.VideoToTensor
3144
+ apply_to:
3145
+ - video.egocentric
3146
+ - _target_: groot.vla.data.transform.VideoCrop
3147
+ apply_to:
3148
+ - video.egocentric
3149
+ scale: 0.95
3150
+ mode: random
3151
+ - _target_: groot.vla.data.transform.VideoResize
3152
+ apply_to:
3153
+ - video.egocentric
3154
+ height: 176
3155
+ width: 320
3156
+ interpolation: linear
3157
+ - _target_: groot.vla.data.transform.VideoColorJitter
3158
+ apply_to:
3159
+ - video.egocentric
3160
+ brightness: 0.3
3161
+ contrast: 0.4
3162
+ saturation: 0.5
3163
+ hue: 0.08
3164
+ - _target_: groot.vla.data.transform.VideoToNumpy
3165
+ apply_to:
3166
+ - video.egocentric
3167
+ - _target_: groot.vla.data.transform.StateActionToTensor
3168
+ apply_to:
3169
+ - state.left_hand
3170
+ - state.right_hand
3171
+ - state.left_arm
3172
+ - state.right_arm
3173
+ - state.rpy
3174
+ - state.height
3175
+ - _target_: groot.vla.data.transform.StateActionTransform
3176
+ apply_to:
3177
+ - state.left_hand
3178
+ - state.right_hand
3179
+ - state.left_arm
3180
+ - state.right_arm
3181
+ - state.rpy
3182
+ - state.height
3183
+ normalization_modes:
3184
+ state.left_hand: q99
3185
+ state.right_hand: q99
3186
+ state.left_arm: q99
3187
+ state.right_arm: q99
3188
+ state.rpy: q99
3189
+ state.height: q99
3190
+ - _target_: groot.vla.data.transform.StateActionToTensor
3191
+ apply_to:
3192
+ - action.left_hand
3193
+ - action.right_hand
3194
+ - action.left_arm
3195
+ - action.right_arm
3196
+ - action.rpy
3197
+ - action.height
3198
+ - action.torso_vx
3199
+ - action.torso_vy
3200
+ - action.torso_vyaw
3201
+ - action.torso_dyaw
3202
+ - _target_: groot.vla.data.transform.StateActionTransform
3203
+ apply_to:
3204
+ - action.left_hand
3205
+ - action.right_hand
3206
+ - action.left_arm
3207
+ - action.right_arm
3208
+ - action.rpy
3209
+ - action.height
3210
+ - action.torso_vx
3211
+ - action.torso_vy
3212
+ - action.torso_vyaw
3213
+ - action.torso_dyaw
3214
+ normalization_modes:
3215
+ action.left_hand: q99
3216
+ action.right_hand: q99
3217
+ action.left_arm: q99
3218
+ action.right_arm: q99
3219
+ action.rpy: q99
3220
+ action.height: q99
3221
+ action.torso_vx: q99
3222
+ action.torso_vy: q99
3223
+ action.torso_vyaw: q99
3224
+ action.torso_dyaw: q99
3225
+ - _target_: groot.vla.data.transform.ConcatTransform
3226
+ video_concat_order:
3227
+ - video.egocentric
3228
+ state_concat_order:
3229
+ - state.left_hand
3230
+ - state.right_hand
3231
+ - state.left_arm
3232
+ - state.right_arm
3233
+ - state.rpy
3234
+ - state.height
3235
+ action_concat_order:
3236
+ - action.left_hand
3237
+ - action.right_hand
3238
+ - action.left_arm
3239
+ - action.right_arm
3240
+ - action.rpy
3241
+ - action.height
3242
+ - action.torso_vx
3243
+ - action.torso_vy
3244
+ - action.torso_vyaw
3245
+ - action.torso_dyaw
3246
+ - _target_: groot.vla.model.dreamzero.transform.dreamzero_cotrain.DreamTransform
3247
+ default_instruction: Perform the default behavior.
3248
+ language_dropout_prob: 0.0
3249
+ always_use_default_instruction: false
3250
+ max_state_dim: 64
3251
+ max_action_dim: 36
3252
+ max_length: 512
3253
+ state_horizon: 1
3254
+ action_horizon: 24
3255
+ embodiment_tag_mapping:
3256
+ real_gr1_arms_only: 0
3257
+ real_gr1_arms_only_annotated: 1
3258
+ real_gr1_arms_waist: 2
3259
+ real_gr1_arms_waist_annotated: 3
3260
+ dexmg_gr1_arms_only_inspire: 4
3261
+ dexmg_gr1_arms_only_fourier: 5
3262
+ dexmg_gr1_arms_waist_fourier: 6
3263
+ robocasa_single_arm: 7
3264
+ onex_eve_gripper: 8
3265
+ robocasa_gr1_arms_only_inspire_hands: 9
3266
+ robocasa_gr1_arms_only_fourier_hands: 10
3267
+ robocasa_gr1_fixed_lower_body_inspire_hands: 11
3268
+ robocasa_gr1_fixed_lower_body_fourier_hands: 12
3269
+ robocasa_panda_omron: 13
3270
+ robocasa_bimanual_panda_parallel_gripper: 15
3271
+ robocasa_bimanual_panda_inspire_hand: 16
3272
+ oxe_droid: 17
3273
+ oxe_fractal: 18
3274
+ oxe_language_table: 19
3275
+ oxe_bridge: 20
3276
+ real_panda_single_arm: 21
3277
+ hot3d_hands_only: 23
3278
+ gr1_unified: 24
3279
+ robocasa_gr1_arms_waist_fourier_hands: 25
3280
+ agibot: 26
3281
+ lapa: 27
3282
+ oxe_mutex: 28
3283
+ oxe_roboset: 29
3284
+ oxe_plex: 30
3285
+ dream: 31
3286
+ yam: 32
3287
+ xdof: 22
3288
+ gr1_unified_segmentation: 14
3289
+ language_table_sim: 7
3290
+ gr1_isaac: 0
3291
+ sim_behavior_r1_pro: 31
3292
+ mecka_hands: 27
3293
+ real_r1_pro_sharpa: 28
3294
+ real_teleop_g1: 33
3295
+ tokenizer_path: /hfm/boqian/liboqian_data/checkpoints/umt5-xxl
3296
+ metadata_versions:
3297
+ oxe_droid: '0221'
3298
+ agibot: '0221'
3299
+ yam: '0221'
3300
+ real_teleop_g1: '0221'
3301
+ fps:
3302
+ yam: 30
3303
+ real_teleop_g1: 30
3304
+ relative_action: true
3305
+ relative_action_per_horizon: false
3306
+ relative_action_keys:
3307
+ - left_hand
3308
+ - right_hand
3309
+ - left_arm
3310
+ - right_arm
3311
+ - rpy
3312
+ - height
3313
+ max_chunk_size: 4
3314
+ dataset_shard_sampling_rate: 0.1
3315
+ mixture_dataset_cls: groot.vla.data.dataset.lerobot_sharded.ShardedLeRobotMixtureDataset.from_mixture_spec
3316
+ single_dataset_cls: groot.vla.data.dataset.lerobot_sharded.ShardedLeRobotSubLangSingleActionChunkDatasetDROID
3317
+ real_teleop_g1_data_root: /hfm/boqian/liboqian_data/data/real_data/gear/g1/Pick_bottle_and_turn_and_pour_into_cup
3318
+ total_training_steps: 1048576000000
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/experiment_cfg/metadata.json ADDED
@@ -0,0 +1,787 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "real_teleop_g1": {
3
+ "statistics": {
4
+ "state": {
5
+ "left_hand": {
6
+ "max": [
7
+ -0.15012885630130768,
8
+ 0.9005824327468872,
9
+ 1.240552306175232,
10
+ -0.001318894443102181,
11
+ -0.029013408347964287,
12
+ -0.0026860591024160385,
13
+ -0.032535798847675323
14
+ ],
15
+ "min": [
16
+ -1.0365108251571655,
17
+ 0.3635019063949585,
18
+ 0.6077943444252014,
19
+ -0.8983967900276184,
20
+ -1.4610702991485596,
21
+ -0.9210805296897888,
22
+ -1.2054297924041748
23
+ ],
24
+ "mean": [
25
+ -0.7106521057783574,
26
+ 0.7397562400548329,
27
+ 0.7932539759105839,
28
+ -0.18307749281402536,
29
+ -0.3996594926481087,
30
+ -0.22440346846797105,
31
+ -0.3648747590553514
32
+ ],
33
+ "std": [
34
+ 0.1800224325521231,
35
+ 0.08892492705684825,
36
+ 0.17347929747061464,
37
+ 0.1250548816433564,
38
+ 0.14322181312600368,
39
+ 0.12746796614686143,
40
+ 0.1004095686025728
41
+ ],
42
+ "q01": [
43
+ -1.0010048174858093,
44
+ 0.5051653385162354,
45
+ 0.6091835498809814,
46
+ -0.5192373794317245,
47
+ -0.7626793467998505,
48
+ -0.5990579128265381,
49
+ -0.6296854817867279
50
+ ],
51
+ "q99": [
52
+ -0.35215023159980774,
53
+ 0.8828813862800599,
54
+ 1.1794202327728271,
55
+ -0.012645758632570512,
56
+ -0.08962718859314919,
57
+ -0.01236837636679411,
58
+ -0.14306417107582092
59
+ ]
60
+ },
61
+ "right_hand": {
62
+ "max": [
63
+ 0.013422729447484016,
64
+ -0.0029037927743047476,
65
+ -0.4307878315448761,
66
+ 1.1691919565200806,
67
+ 1.5508151054382324,
68
+ 1.008196473121643,
69
+ 1.4388744831085205
70
+ ],
71
+ "min": [
72
+ -0.9901090860366821,
73
+ -0.8877645134925842,
74
+ -1.2071785926818848,
75
+ -0.03135598078370094,
76
+ 0.03392980992794037,
77
+ 0.023357456550002098,
78
+ 0.019666638225317
79
+ ],
80
+ "mean": [
81
+ -0.4129581264587618,
82
+ -0.4582520757612821,
83
+ -0.664123836057758,
84
+ 0.4952124991587111,
85
+ 0.9231042625678995,
86
+ 0.5022273107153479,
87
+ 0.669409344487608
88
+ ],
89
+ "std": [
90
+ 0.19006874361243062,
91
+ 0.11976727609652792,
92
+ 0.10119243113959121,
93
+ 0.2266161525462106,
94
+ 0.4655465641238765,
95
+ 0.21432269096327275,
96
+ 0.31467715432459836
97
+ ],
98
+ "q01": [
99
+ -0.9081442606449127,
100
+ -0.8110105991363525,
101
+ -0.9831316405534745,
102
+ 0.023696813061833383,
103
+ 0.037188550531864165,
104
+ 0.024981562048196793,
105
+ 0.02064503453671932
106
+ ],
107
+ "q99": [
108
+ -0.10628944709897117,
109
+ -0.18123969539999962,
110
+ -0.5385138392448425,
111
+ 0.9067972785234452,
112
+ 1.4886025190353394,
113
+ 0.871771514415741,
114
+ 1.3075256443023682
115
+ ]
116
+ },
117
+ "left_arm": {
118
+ "max": [
119
+ 0.6784030199050903,
120
+ 0.5408600568771362,
121
+ 0.261579692363739,
122
+ 1.2853081226348877,
123
+ 0.6677011251449585,
124
+ 0.01986505277454853,
125
+ 0.5779042840003967
126
+ ],
127
+ "min": [
128
+ -0.6697743535041809,
129
+ -0.04027898237109184,
130
+ -0.6574546098709106,
131
+ -0.02732403390109539,
132
+ -0.18146513402462006,
133
+ -1.2264176607131958,
134
+ -0.8434670567512512
135
+ ],
136
+ "mean": [
137
+ 0.06576749867032948,
138
+ 0.25579782742676094,
139
+ -0.06004482964572005,
140
+ 0.808768983211113,
141
+ 0.10850686643277818,
142
+ -0.7254681426808276,
143
+ -0.2878231007376268
144
+ ],
145
+ "std": [
146
+ 0.16998997778227107,
147
+ 0.09886796724754655,
148
+ 0.1532526690961023,
149
+ 0.3788245539398352,
150
+ 0.1304561854786525,
151
+ 0.3305712410884026,
152
+ 0.20458272924414123
153
+ ],
154
+ "q01": [
155
+ -0.472993403673172,
156
+ 0.01923264417797327,
157
+ -0.4175487527251244,
158
+ 0.11523827515542508,
159
+ -0.13969017088413238,
160
+ -1.190878005027771,
161
+ -0.7066819667816162
162
+ ],
163
+ "q99": [
164
+ 0.4539884480834007,
165
+ 0.5193723440170288,
166
+ 0.2179892195761203,
167
+ 1.2710349559783936,
168
+ 0.6194926702976227,
169
+ -0.09952061586081994,
170
+ 0.17473116487264606
171
+ ]
172
+ },
173
+ "right_arm": {
174
+ "max": [
175
+ 0.9002070426940918,
176
+ 0.11829628795385361,
177
+ 0.8515990376472473,
178
+ 1.2844573259353638,
179
+ 0.8022719621658325,
180
+ 0.6399767994880676,
181
+ 1.2358516454696655
182
+ ],
183
+ "min": [
184
+ -1.2026649713516235,
185
+ -0.6969066858291626,
186
+ -0.815610408782959,
187
+ -0.9874761700630188,
188
+ -1.6946533918380737,
189
+ -1.4213329553604126,
190
+ -1.342279314994812
191
+ ],
192
+ "mean": [
193
+ -0.0662775668920557,
194
+ -0.1345353524434497,
195
+ 0.0634982026036378,
196
+ -0.10989526924031702,
197
+ -0.05263142642975812,
198
+ -0.12096835662218453,
199
+ 0.120861711803444
200
+ ],
201
+ "std": [
202
+ 0.4008494462356859,
203
+ 0.1700786431413106,
204
+ 0.21295159275658426,
205
+ 0.43649950708158436,
206
+ 0.4497950790039162,
207
+ 0.36514804111543175,
208
+ 0.1967684071659634
209
+ ],
210
+ "q01": [
211
+ -0.950262793302536,
212
+ -0.5722128254175186,
213
+ -0.4087426000833511,
214
+ -0.8998615843057632,
215
+ -1.200806084871292,
216
+ -1.1520812797546387,
217
+ -0.3325912955403328
218
+ ],
219
+ "q99": [
220
+ 0.7100517773628232,
221
+ 0.10524750150740145,
222
+ 0.6194546324014664,
223
+ 0.9450081789493561,
224
+ 0.559214276075363,
225
+ 0.5243198603391647,
226
+ 0.6973656332492828
227
+ ]
228
+ },
229
+ "rpy": {
230
+ "max": [
231
+ 0.1827869415283203,
232
+ 0.12582087516784668,
233
+ 0.7600905299186707
234
+ ],
235
+ "min": [
236
+ -0.25714462995529175,
237
+ -0.27553248405456543,
238
+ -0.3924732208251953
239
+ ],
240
+ "mean": [
241
+ -0.07450471914543803,
242
+ -0.016358907944713997,
243
+ 0.08461220301037418
244
+ ],
245
+ "std": [
246
+ 0.052686434459038575,
247
+ 0.05521563296798423,
248
+ 0.1871730371455983
249
+ ],
250
+ "q01": [
251
+ -0.21652273923158646,
252
+ -0.15164137467741967,
253
+ -0.3411825066804886
254
+ ],
255
+ "q99": [
256
+ 0.023736179061233972,
257
+ 0.0924126328527927,
258
+ 0.6843682318925858
259
+ ]
260
+ },
261
+ "height": {
262
+ "max": [
263
+ 0.75
264
+ ],
265
+ "min": [
266
+ 0.6516909599304199
267
+ ],
268
+ "mean": [
269
+ 0.717019414027915
270
+ ],
271
+ "std": [
272
+ 0.010611909948945071
273
+ ],
274
+ "q01": [
275
+ 0.681534378528595
276
+ ],
277
+ "q99": [
278
+ 0.7377596861124038
279
+ ]
280
+ }
281
+ },
282
+ "action": {
283
+ "left_hand": {
284
+ "max": [
285
+ 0.5982731580734253,
286
+ 0.5278768539428711,
287
+ 0.5354012846946716,
288
+ 0.4014817903516814,
289
+ 0.4918059706687927,
290
+ 0.44213195890188217,
291
+ 0.4230584502220154
292
+ ],
293
+ "min": [
294
+ -0.6301521956920624,
295
+ -0.5149323344230652,
296
+ -0.529521644115448,
297
+ -0.6856191083788872,
298
+ -1.158537894487381,
299
+ -0.7385352402925491,
300
+ -0.905200719833374
301
+ ],
302
+ "mean": [
303
+ -0.017651338260789542,
304
+ 0.00866811522079485,
305
+ -0.004330770698072518,
306
+ -0.002544540625456693,
307
+ -0.0005896838413398195,
308
+ 0.002361588407552847,
309
+ 0.004366535660068055
310
+ ],
311
+ "std": [
312
+ 0.06554604732418758,
313
+ 0.07154837844491334,
314
+ 0.12856598959026014,
315
+ 0.03597542702397461,
316
+ 0.04581648449077367,
317
+ 0.03962922545114253,
318
+ 0.04346845484086631
319
+ ],
320
+ "q01": [
321
+ -0.24043308198451993,
322
+ -0.19401812851428984,
323
+ -0.39614349603652954,
324
+ -0.10097884107381103,
325
+ -0.14891792684793473,
326
+ -0.11517959907650946,
327
+ -0.1279512584209442
328
+ ],
329
+ "q99": [
330
+ 0.23256101012229896,
331
+ 0.20524268746376023,
332
+ 0.3933871388435364,
333
+ 0.09811923948582232,
334
+ 0.12937735170125958,
335
+ 0.1224899165794337,
336
+ 0.11734045296907396
337
+ ]
338
+ },
339
+ "right_hand": {
340
+ "max": [
341
+ 0.8790446370840073,
342
+ 0.642131395637989,
343
+ 0.5467694997787476,
344
+ 1.2996619567275047,
345
+ 1.698824219405651,
346
+ 1.07933783903718,
347
+ 1.2983299642801285
348
+ ],
349
+ "min": [
350
+ -0.8118728846311569,
351
+ -0.6551126688718796,
352
+ -0.8104255795478821,
353
+ -0.940872801351361,
354
+ -1.4264521131990477,
355
+ -0.9354669973254204,
356
+ -0.9567366391420364
357
+ ],
358
+ "mean": [
359
+ 0.06989124645612069,
360
+ -0.18283122662561244,
361
+ -0.07939327808296306,
362
+ 0.13485396014136192,
363
+ 0.1019227924721336,
364
+ 0.12847191988697357,
365
+ 0.06420547800869135
366
+ ],
367
+ "std": [
368
+ 0.14218628429822006,
369
+ 0.14446828671114784,
370
+ 0.1265010717040926,
371
+ 0.1855492922965264,
372
+ 0.2240967671588938,
373
+ 0.17045304446399806,
374
+ 0.1757324556093506
375
+ ],
376
+ "q01": [
377
+ -0.35978180170059204,
378
+ -0.4319852590560913,
379
+ -0.36317331790924073,
380
+ -0.523832806199789,
381
+ -0.6789782047271729,
382
+ -0.4287099301815033,
383
+ -0.4532649278640747
384
+ ],
385
+ "q99": [
386
+ 0.4718186043202869,
387
+ 0.19760750830173343,
388
+ 0.25330804288387265,
389
+ 0.6829912588000264,
390
+ 0.8806305289268437,
391
+ 0.6021148726344102,
392
+ 0.6443318486213678
393
+ ]
394
+ },
395
+ "left_arm": {
396
+ "max": [
397
+ 0.48457716405391693,
398
+ 0.3535273000597954,
399
+ 0.21352535486221313,
400
+ 0.7616467773914337,
401
+ 0.6007281094789505,
402
+ 0.41783052682876587,
403
+ 0.5165230333805084
404
+ ],
405
+ "min": [
406
+ -0.6373330056667328,
407
+ -0.15849913656711578,
408
+ -0.5613232888281345,
409
+ -0.701464831829071,
410
+ -0.2516954243183136,
411
+ -0.8871668800711632,
412
+ -0.4376880154013634
413
+ ],
414
+ "mean": [
415
+ -0.011776995941582026,
416
+ 0.00991253342440134,
417
+ -0.006852865379075551,
418
+ 0.010552169557516157,
419
+ 0.001214240309897302,
420
+ -0.014453163590611127,
421
+ -0.010611657202882456
422
+ ],
423
+ "std": [
424
+ 0.03602045474094356,
425
+ 0.024036875417496602,
426
+ 0.02602605089260403,
427
+ 0.05981475365626236,
428
+ 0.028429534988341422,
429
+ 0.08916377427180806,
430
+ 0.03528055113076741
431
+ ],
432
+ "q01": [
433
+ -0.1545458011329174,
434
+ -0.031217777729034422,
435
+ -0.11606558486819267,
436
+ -0.0627044416964054,
437
+ -0.056227393448352814,
438
+ -0.5197785004973411,
439
+ -0.156750052748248
440
+ ],
441
+ "q99": [
442
+ 0.05294616930186727,
443
+ 0.11603573272004716,
444
+ 0.02942010760307312,
445
+ 0.32693205326795444,
446
+ 0.11353727197274502,
447
+ 0.06767475279048074,
448
+ 0.06325854659080493
449
+ ]
450
+ },
451
+ "right_arm": {
452
+ "max": [
453
+ 1.0019388496875763,
454
+ 0.6778036952018738,
455
+ 0.8202316761016846,
456
+ 1.2868973910808563,
457
+ 1.6054365634918213,
458
+ 0.9513278678059578,
459
+ 1.0065587792778388
460
+ ],
461
+ "min": [
462
+ -0.8566206395626068,
463
+ -0.5352737102657557,
464
+ -0.9050570726394653,
465
+ -1.1766326129436493,
466
+ -1.1665689051151276,
467
+ -1.226991206407547,
468
+ -1.4597684778273106
469
+ ],
470
+ "mean": [
471
+ 0.0011831602733654706,
472
+ 0.004668234701338991,
473
+ -0.00464666805688978,
474
+ -0.008819726561502726,
475
+ 0.01680292255365519,
476
+ -0.023272413508623503,
477
+ 0.001828295062135536
478
+ ],
479
+ "std": [
480
+ 0.15114015459104163,
481
+ 0.10717935148199742,
482
+ 0.12210241645133885,
483
+ 0.18724624428113243,
484
+ 0.21873655609526446,
485
+ 0.18543346050682308,
486
+ 0.1150104843557695
487
+ ],
488
+ "q01": [
489
+ -0.39952214881777764,
490
+ -0.25475079081952573,
491
+ -0.4339929953217506,
492
+ -0.645755667425692,
493
+ -0.5597020275890827,
494
+ -0.640592061728239,
495
+ -0.3658384716138244
496
+ ],
497
+ "q99": [
498
+ 0.47152970274910266,
499
+ 0.3625141793861984,
500
+ 0.3024886786937714,
501
+ 0.5137429375201458,
502
+ 0.7895279735326737,
503
+ 0.5061310570687053,
504
+ 0.30551581159233987
505
+ ]
506
+ },
507
+ "rpy": {
508
+ "max": [
509
+ 0.2602043002843857,
510
+ 0.0978180319070816,
511
+ 0.4442971870303154
512
+ ],
513
+ "min": [
514
+ -0.2339760884642601,
515
+ -0.27553248405456543,
516
+ -0.44536447897553444
517
+ ],
518
+ "mean": [
519
+ -0.0005869882495184713,
520
+ 0.0015875524968715301,
521
+ 0.0008563187639556434
522
+ ],
523
+ "std": [
524
+ 0.015963769360634213,
525
+ 0.009780051628911945,
526
+ 0.024366577111250988
527
+ ],
528
+ "q01": [
529
+ -0.049293445050716395,
530
+ -0.023501918860711155,
531
+ -0.0595750443637371
532
+ ],
533
+ "q99": [
534
+ 0.05025968039408321,
535
+ 0.028102314658462975,
536
+ 0.0824947778135533
537
+ ]
538
+ },
539
+ "height": {
540
+ "max": [
541
+ 0.04315239191055298
542
+ ],
543
+ "min": [
544
+ -0.08728241920471191
545
+ ],
546
+ "mean": [
547
+ 5.191507986045299e-05
548
+ ],
549
+ "std": [
550
+ 0.004163189516331377
551
+ ],
552
+ "q01": [
553
+ -0.012046998739242552
554
+ ],
555
+ "q99": [
556
+ 0.012092745304107644
557
+ ]
558
+ },
559
+ "torso_vx": {
560
+ "max": [
561
+ 0.3499999940395355
562
+ ],
563
+ "min": [
564
+ 0.0
565
+ ],
566
+ "mean": [
567
+ 0.017269024247052768
568
+ ],
569
+ "std": [
570
+ 0.07580197349073346
571
+ ],
572
+ "q01": [
573
+ 0.0
574
+ ],
575
+ "q99": [
576
+ 0.3499999940395355
577
+ ]
578
+ },
579
+ "torso_vy": {
580
+ "max": [
581
+ 0.5
582
+ ],
583
+ "min": [
584
+ 0.0
585
+ ],
586
+ "mean": [
587
+ 0.0002320065992988245
588
+ ],
589
+ "std": [
590
+ 0.01076798368252486
591
+ ],
592
+ "q01": [
593
+ 0.0
594
+ ],
595
+ "q99": [
596
+ 0.0
597
+ ]
598
+ },
599
+ "torso_vyaw": {
600
+ "max": [
601
+ 0.5
602
+ ],
603
+ "min": [
604
+ -0.5
605
+ ],
606
+ "mean": [
607
+ -0.03733115137363985
608
+ ],
609
+ "std": [
610
+ 0.1199053455233051
611
+ ],
612
+ "q01": [
613
+ -0.5
614
+ ],
615
+ "q99": [
616
+ 0.0
617
+ ]
618
+ },
619
+ "torso_dyaw": {
620
+ "max": [
621
+ 0.3072107136249542
622
+ ],
623
+ "min": [
624
+ -0.1875186562538147
625
+ ],
626
+ "mean": [
627
+ 0.016342195046893805
628
+ ],
629
+ "std": [
630
+ 0.048962870720188865
631
+ ],
632
+ "q01": [
633
+ -0.005849878406152129
634
+ ],
635
+ "q99": [
636
+ 0.22258896410465237
637
+ ]
638
+ }
639
+ }
640
+ },
641
+ "modalities": {
642
+ "video": {
643
+ "egocentric": {
644
+ "resolution": [
645
+ 640,
646
+ 480
647
+ ],
648
+ "channels": 3,
649
+ "fps": 30.0
650
+ }
651
+ },
652
+ "state": {
653
+ "left_hand": {
654
+ "absolute": true,
655
+ "rotation_type": null,
656
+ "shape": [
657
+ 7
658
+ ],
659
+ "continuous": true
660
+ },
661
+ "right_hand": {
662
+ "absolute": true,
663
+ "rotation_type": null,
664
+ "shape": [
665
+ 7
666
+ ],
667
+ "continuous": true
668
+ },
669
+ "left_arm": {
670
+ "absolute": true,
671
+ "rotation_type": null,
672
+ "shape": [
673
+ 7
674
+ ],
675
+ "continuous": true
676
+ },
677
+ "right_arm": {
678
+ "absolute": true,
679
+ "rotation_type": null,
680
+ "shape": [
681
+ 7
682
+ ],
683
+ "continuous": true
684
+ },
685
+ "rpy": {
686
+ "absolute": true,
687
+ "rotation_type": null,
688
+ "shape": [
689
+ 3
690
+ ],
691
+ "continuous": true
692
+ },
693
+ "height": {
694
+ "absolute": true,
695
+ "rotation_type": null,
696
+ "shape": [
697
+ 1
698
+ ],
699
+ "continuous": true
700
+ }
701
+ },
702
+ "action": {
703
+ "left_hand": {
704
+ "absolute": true,
705
+ "rotation_type": null,
706
+ "shape": [
707
+ 7
708
+ ],
709
+ "continuous": true
710
+ },
711
+ "right_hand": {
712
+ "absolute": true,
713
+ "rotation_type": null,
714
+ "shape": [
715
+ 7
716
+ ],
717
+ "continuous": true
718
+ },
719
+ "left_arm": {
720
+ "absolute": true,
721
+ "rotation_type": null,
722
+ "shape": [
723
+ 7
724
+ ],
725
+ "continuous": true
726
+ },
727
+ "right_arm": {
728
+ "absolute": true,
729
+ "rotation_type": null,
730
+ "shape": [
731
+ 7
732
+ ],
733
+ "continuous": true
734
+ },
735
+ "rpy": {
736
+ "absolute": true,
737
+ "rotation_type": null,
738
+ "shape": [
739
+ 3
740
+ ],
741
+ "continuous": true
742
+ },
743
+ "height": {
744
+ "absolute": true,
745
+ "rotation_type": null,
746
+ "shape": [
747
+ 1
748
+ ],
749
+ "continuous": true
750
+ },
751
+ "torso_vx": {
752
+ "absolute": false,
753
+ "rotation_type": null,
754
+ "shape": [
755
+ 1
756
+ ],
757
+ "continuous": true
758
+ },
759
+ "torso_vy": {
760
+ "absolute": false,
761
+ "rotation_type": null,
762
+ "shape": [
763
+ 1
764
+ ],
765
+ "continuous": true
766
+ },
767
+ "torso_vyaw": {
768
+ "absolute": false,
769
+ "rotation_type": null,
770
+ "shape": [
771
+ 1
772
+ ],
773
+ "continuous": true
774
+ },
775
+ "torso_dyaw": {
776
+ "absolute": false,
777
+ "rotation_type": null,
778
+ "shape": [
779
+ 1
780
+ ],
781
+ "continuous": true
782
+ }
783
+ }
784
+ },
785
+ "embodiment_tag": "real_teleop_g1"
786
+ }
787
+ }
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/loss_log.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/wandb/debug-internal.log ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-03-19T11:07:03.568457075Z","level":"INFO","msg":"stream: starting","core version":"0.25.0"}
2
+ {"time":"2026-03-19T11:07:03.817295367Z","level":"INFO","msg":"stream: created new stream","id":"lsme06f2"}
3
+ {"time":"2026-03-19T11:07:03.817394477Z","level":"INFO","msg":"handler: started","stream_id":"lsme06f2"}
4
+ {"time":"2026-03-19T11:07:03.817531838Z","level":"INFO","msg":"stream: started","id":"lsme06f2"}
5
+ {"time":"2026-03-19T11:07:03.817558849Z","level":"INFO","msg":"sender: started","stream_id":"lsme06f2"}
6
+ {"time":"2026-03-19T11:07:03.817564649Z","level":"INFO","msg":"writer: started","stream_id":"lsme06f2"}
7
+ {"time":"2026-03-20T18:00:34.85408512Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/boqianli/dreamzero/lsme06f2/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
8
+ {"time":"2026-03-20T20:47:26.570329788Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/boqianli/dreamzero/lsme06f2/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
9
+ {"time":"2026-03-20T20:48:53.069085604Z","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/files/boqianli/dreamzero/lsme06f2/file_stream","body":"{\"error\":\"context deadline exceeded\"}"}
10
+ {"time":"2026-03-20T20:56:21.788979982Z","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/files/boqianli/dreamzero/lsme06f2/file_stream","body":"{\"error\":\"context deadline exceeded\"}"}
11
+ {"time":"2026-03-20T20:58:30.788847673Z","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/files/boqianli/dreamzero/lsme06f2/file_stream","body":"{\"error\":\"context deadline exceeded\"}"}
12
+ {"time":"2026-03-20T21:01:49.472533676Z","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/files/boqianli/dreamzero/lsme06f2/file_stream","body":"{\"error\":\"context deadline exceeded\"}"}
13
+ {"time":"2026-03-20T21:03:33.794344816Z","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/files/boqianli/dreamzero/lsme06f2/file_stream","body":"{\"error\":\"context deadline exceeded\"}"}
14
+ {"time":"2026-03-20T21:04:30.913563045Z","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/files/boqianli/dreamzero/lsme06f2/file_stream","body":"{\"error\":\"context deadline exceeded\"}"}
15
+ {"time":"2026-03-20T21:06:13.266987441Z","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/files/boqianli/dreamzero/lsme06f2/file_stream","body":"{\"error\":\"context deadline exceeded\"}"}
16
+ {"time":"2026-03-23T07:12:08.820179122Z","level":"INFO","msg":"stream: closing","id":"lsme06f2"}
17
+ {"time":"2026-03-23T07:12:10.471712011Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
18
+ {"time":"2026-03-23T07:12:10.6442904Z","level":"INFO","msg":"handler: closed","stream_id":"lsme06f2"}
19
+ {"time":"2026-03-23T07:12:10.644415291Z","level":"INFO","msg":"sender: closed","stream_id":"lsme06f2"}
20
+ {"time":"2026-03-23T07:12:10.644451071Z","level":"INFO","msg":"stream: closed","id":"lsme06f2"}
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/wandb/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-03-19 11:07:03,312 INFO MainThread:1976195 [wandb_setup.py:_flush():81] Current SDK version is 0.25.0
2
+ 2026-03-19 11:07:03,312 INFO MainThread:1976195 [wandb_setup.py:_flush():81] Configure stats pid to 1976195
3
+ 2026-03-19 11:07:03,312 INFO MainThread:1976195 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-03-19 11:07:03,312 INFO MainThread:1976195 [wandb_init.py:setup_run_log_directory():717] Logging user logs to checkpoints/dreamzero_real_teleop_g1_full_finetune/wandb/run-20260319_110703-lsme06f2/logs/debug.log
5
+ 2026-03-19 11:07:03,312 INFO MainThread:1976195 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to checkpoints/dreamzero_real_teleop_g1_full_finetune/wandb/run-20260319_110703-lsme06f2/logs/debug-internal.log
6
+ 2026-03-19 11:07:03,312 INFO MainThread:1976195 [wandb_init.py:init():844] calling init triggers
7
+ 2026-03-19 11:07:03,312 INFO MainThread:1976195 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'_wandb': {}}
9
+ 2026-03-19 11:07:03,312 INFO MainThread:1976195 [wandb_init.py:init():892] starting backend
10
+ 2026-03-19 11:07:03,562 INFO MainThread:1976195 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-03-19 11:07:03,564 INFO MainThread:1976195 [wandb_init.py:init():903] backend started and connected
12
+ 2026-03-19 11:07:03,565 INFO MainThread:1976195 [wandb_init.py:init():973] updated telemetry
13
+ 2026-03-19 11:07:03,570 INFO MainThread:1976195 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-03-19 11:07:04,794 INFO MainThread:1976195 [wandb_init.py:init():1042] starting run threads in backend
15
+ 2026-03-19 11:07:04,941 INFO MainThread:1976195 [wandb_run.py:_console_start():2524] atexit reg
16
+ 2026-03-19 11:07:04,942 INFO MainThread:1976195 [wandb_run.py:_redirect():2373] redirect: wrap_raw
17
+ 2026-03-19 11:07:04,942 INFO MainThread:1976195 [wandb_run.py:_redirect():2442] Wrapping output streams.
18
+ 2026-03-19 11:07:04,942 INFO MainThread:1976195 [wandb_run.py:_redirect():2465] Redirects installed.
19
+ 2026-03-19 11:07:04,944 INFO MainThread:1976195 [wandb_init.py:init():1082] run started, returning control to user process
20
+ 2026-03-19 11:07:04,946 INFO MainThread:1976195 [wandb_run.py:_config_callback():1403] config_cb None None {'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': None, 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': None, 'eos_token_id': None, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': './checkpoints/dreamzero_real_teleop_g1_full_finetune', '_attn_implementation_autoset': True, 'transformers_version': '4.51.3', 'model_dtype': 'float32', 'hidden_size': 0, 'action_horizon': 24, 'action_dim': 36, 'backbone_cfg': {'_target_': 'groot.vla.model.dreamzero.backbone.identity.IdentityBackbone'}, 'action_head_cfg': {'config': {'backbone_features_projector_cfg': None, '_target_': 'groot.vla.model.dreamzero.action_head.wan_flow_matching_action_tf.WANPolicyHeadConfig', '_recursive_': False, 'tiled': 
False, 'tile_size_height': 34, 'tile_size_width': 34, 'tile_stride_height': 18, 'tile_stride_width': 16, 'lora_rank': 4, 'lora_alpha': 4, 'num_frames': 33, 'num_frame_per_block': 2, 'lora_target_modules': 'q,k,v,o,ffn.0,ffn.2', 'init_lora_weights': 'kaiming', 'train_architecture': 'full', 'use_gradient_checkpointing': True, 'add_pos_embed': True, 'model_dtype': 'float32', 'max_state_dim': 64, 'max_action_dim': 36, 'action_loss_embodiment_ids': [26, 17, 32], 'hidden_size': 64, 'input_embedding_dim': 1536, 'backbone_embedding_dim': 0, 'repa_layer': 8, 'repa_coeff': 1.0, 'load_pretrained_det_decode_layer_path': None, 'freeze_decode_layer': False, 'expand_batch': None, 'use_vlln': True, 'vl_self_attention_cfg': {'_target_': 'groot.vla.model.n1_5.modules.cross_attention_dit.SelfAttentionTransformer', 'positional_embeddings': None, 'num_layers': 4, 'num_attention_heads': 24, 'attention_head_dim': 64, 'dropout': 0.2, 'final_dropout': True}, 'diffusion_model_cfg': {'_target_': 'groot.vla.model.dreamzero.modules.wan_video_dit_action_casual_chunk.CausalWanModel', '_convert_': 'object', 'diffusion_model_pretrained_path': '/hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P', 'model_type': 'i2v', 'frame_seqlen': 220, 'dim': 5120, 'in_dim': 36, 'ffn_dim': 13824, 'out_dim': 16, 'freq_dim': 256, 'eps': 1e-06, 'num_heads': 40, 'num_layers': 40, 'max_chunk_size': 4, 'num_frame_per_block': 2, 'num_action_per_block': 24, 'num_state_per_block': 1, 'action_dim': 36}, 'text_encoder_cfg': {'_target_': 'groot.vla.model.dreamzero.modules.wan_video_text_encoder.WanTextEncoder', '_convert_': 'object', 'text_encoder_pretrained_path': '/hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/models_t5_umt5-xxl-enc-bf16.pth'}, 'image_encoder_cfg': {'_target_': 'groot.vla.model.dreamzero.modules.wan_video_image_encoder.WanImageEncoder', '_convert_': 'object', 'image_encoder_pretrained_path': 
'/hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth'}, 'vae_cfg': {'_target_': 'groot.vla.model.dreamzero.modules.wan_video_vae.WanVideoVAE', '_convert_': 'object', 'vae_pretrained_path': '/hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/Wan2.1_VAE.pth'}, 'action_dim': 36, 'action_horizon': 24, 'num_inference_timesteps': 4, 'noise_beta_alpha': 1.5, 'noise_beta_beta': 1.0, 'noise_s': 0.999, 'num_timestep_buckets': 1000, 'decouple_video_action_noise': False, 'video_noise_beta_alpha': 3.0, 'video_noise_beta_beta': 1.0, 'tune_projector': True, 'tune_diffusion_model': True, 'skip_component_loading': True}, '_target_': 'groot.vla.model.dreamzero.action_head.wan_flow_matching_action_tf.WANPolicyHead', '_convert_': 'object'}, 'resume_path': './checkpoints/dreamzero_real_teleop_g1_full_finetune', 'model_type': 'vla', 'output_dir': './checkpoints/dreamzero_real_teleop_g1_full_finetune', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 64, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 1e-05, 'weight_decay': 1e-05, 'adam_beta1': 0.95, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1000, 'max_steps': 20000, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.05, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './checkpoints/dreamzero_real_teleop_g1_full_finetune/runs/Mar19_11-00-49_nebula100', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 10, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 8000, 'save_total_limit': 10, 
'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': True, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 1, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'dreamzero_real_teleop_g1_full_finetune', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': True, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'tp_size': 0, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': 'groot/vla/configs/deepspeed/zero2_offload.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': False, 'ddp_bucket_cap_mb': 100, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': False, 'dataloader_persistent_workers': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'gradient_checkpointing': False, 
'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'average_tokens_across_devices': False}
21
+ 2026-03-19 11:07:04,952 INFO MainThread:1976195 [wandb_config.py:__setitem__():155] [no run ID] config set model/num_parameters = 22924196696 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x76265c247450>>
22
+ 2026-03-19 11:07:04,952 INFO MainThread:1976195 [wandb_run.py:_config_callback():1403] config_cb model/num_parameters 22924196696 None
23
+ 2026-03-23 07:12:08,813 INFO wandb-AsyncioManager-main:1976195 [service_client.py:_forward_responses():134] Reached EOF.
24
+ 2026-03-23 07:12:08,814 INFO wandb-AsyncioManager-main:1976195 [mailbox.py:close():155] Closing mailbox, abandoning 1 handles.
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/wandb/run-20260319_110703-lsme06f2/files/config.yaml ADDED
@@ -0,0 +1,651 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _attn_implementation_autoset:
2
+ value: true
3
+ _name_or_path:
4
+ value: ./checkpoints/dreamzero_real_teleop_g1_full_finetune
5
+ _wandb:
6
+ value:
7
+ cli_version: 0.25.0
8
+ e:
9
+ w0fhsrag0fgp0xlyv0somzfdgnau53vg:
10
+ args:
11
+ - report_to=wandb
12
+ - data=dreamzero/real_teleop_g1_relative
13
+ - wandb_project=dreamzero
14
+ - train_architecture=full
15
+ - num_frames=33
16
+ - action_horizon=24
17
+ - num_views=1
18
+ - model=dreamzero/vla
19
+ - model/dreamzero/action_head=wan_flow_matching_action_tf
20
+ - model/dreamzero/transform=dreamzero_cotrain
21
+ - num_frame_per_block=2
22
+ - num_action_per_block=24
23
+ - num_state_per_block=1
24
+ - seed=42
25
+ - training_args.learning_rate=1e-5
26
+ - training_args.deepspeed=groot/vla/configs/deepspeed/zero2_offload.json
27
+ - save_steps=8000
28
+ - training_args.warmup_ratio=0.05
29
+ - output_dir=./checkpoints/dreamzero_real_teleop_g1_full_finetune
30
+ - per_device_train_batch_size=1
31
+ - max_steps=20000
32
+ - weight_decay=1e-5
33
+ - save_total_limit=10
34
+ - upload_checkpoints=false
35
+ - bf16=true
36
+ - tf32=true
37
+ - eval_bf16=true
38
+ - dataloader_pin_memory=false
39
+ - dataloader_num_workers=1
40
+ - image_resolution_width=320
41
+ - image_resolution_height=176
42
+ - save_lora_only=false
43
+ - max_chunk_size=4
44
+ - frame_seqlen=220
45
+ - save_strategy=steps
46
+ - real_teleop_g1_data_root=/hfm/boqian/liboqian_data/data/real_data/gear/g1/Pick_bottle_and_turn_and_pour_into_cup
47
+ - dit_version=/hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P
48
+ - text_encoder_pretrained_path=/hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/models_t5_umt5-xxl-enc-bf16.pth
49
+ - image_encoder_pretrained_path=/hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth
50
+ - vae_pretrained_path=/hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/Wan2.1_VAE.pth
51
+ - tokenizer_path=/hfm/boqian/liboqian_data/checkpoints/umt5-xxl
52
+ - pretrained_model_path=/hfm/boqian/liboqian_data/checkpoints/DreamZero-AgiBot
53
+ - max_action_dim=36
54
+ - ++action_head_cfg.config.skip_component_loading=true
55
+ codePath: groot/vla/experiment/experiment.py
56
+ codePathLocal: groot/vla/experiment/experiment.py
57
+ cpu_count: 128
58
+ cpu_count_logical: 128
59
+ cudaVersion: "12.9"
60
+ disk:
61
+ /:
62
+ total: "3934171283456"
63
+ used: "3671307268096"
64
+ email: 2074894208@qq.com
65
+ executable: /hfm/boqian/miniconda3/envs/dreamzero/bin/python3.11
66
+ git:
67
+ commit: 71e794d8192d689c24bc5ae85a1bb28a3d423d2f
68
+ remote: git@github.com:boqian-li/DreamZero-private.git
69
+ gpu: NVIDIA A100-SXM4-80GB
70
+ gpu_count: 8
71
+ gpu_nvidia:
72
+ - architecture: Ampere
73
+ cudaCores: 6912
74
+ memoryTotal: "85899345920"
75
+ name: NVIDIA A100-SXM4-80GB
76
+ uuid: GPU-95155859-2785-a5fa-d493-0c9e9eb3332f
77
+ - architecture: Ampere
78
+ cudaCores: 6912
79
+ memoryTotal: "85899345920"
80
+ name: NVIDIA A100-SXM4-80GB
81
+ uuid: GPU-3829db0c-5c30-974b-5c88-9ed4f984eddf
82
+ - architecture: Ampere
83
+ cudaCores: 6912
84
+ memoryTotal: "85899345920"
85
+ name: NVIDIA A100-SXM4-80GB
86
+ uuid: GPU-0e260d0d-d121-6d8c-4c97-6ca0dfce506f
87
+ - architecture: Ampere
88
+ cudaCores: 6912
89
+ memoryTotal: "85899345920"
90
+ name: NVIDIA A100-SXM4-80GB
91
+ uuid: GPU-b99492cb-2347-0f80-e94f-a95e105eaffc
92
+ - architecture: Ampere
93
+ cudaCores: 6912
94
+ memoryTotal: "85899345920"
95
+ name: NVIDIA A100-SXM4-80GB
96
+ uuid: GPU-8a143efc-044d-2012-5820-afb258793ffd
97
+ - architecture: Ampere
98
+ cudaCores: 6912
99
+ memoryTotal: "85899345920"
100
+ name: NVIDIA A100-SXM4-80GB
101
+ uuid: GPU-9fedd571-c6c4-fd5a-1dd7-f82e048386ad
102
+ - architecture: Ampere
103
+ cudaCores: 6912
104
+ memoryTotal: "85899345920"
105
+ name: NVIDIA A100-SXM4-80GB
106
+ uuid: GPU-8cd44ee0-3344-490d-bb51-ba5b38f2b924
107
+ - architecture: Ampere
108
+ cudaCores: 6912
109
+ memoryTotal: "85899345920"
110
+ name: NVIDIA A100-SXM4-80GB
111
+ uuid: GPU-55a7cb99-494a-2d02-b9d8-6f85a0614e8c
112
+ host: nebula100
113
+ memory:
114
+ total: "1082011422720"
115
+ os: Linux-6.8.0-100-generic-x86_64-with-glibc2.39
116
+ program: /hfm/boqian/liboqian_code/DreamZero-private/groot/vla/experiment/experiment.py
117
+ python: CPython 3.11.14
118
+ root: ./checkpoints/dreamzero_real_teleop_g1_full_finetune
119
+ startedAt: "2026-03-19T11:07:03.311236Z"
120
+ writerId: w0fhsrag0fgp0xlyv0somzfdgnau53vg
121
+ m:
122
+ - "1": train/global_step
123
+ "6":
124
+ - 3
125
+ "7": []
126
+ - "2": '*'
127
+ "5": 1
128
+ "6":
129
+ - 1
130
+ "7": []
131
+ python_version: 3.11.14
132
+ t:
133
+ "1":
134
+ - 1
135
+ - 11
136
+ - 41
137
+ - 49
138
+ - 50
139
+ - 51
140
+ - 71
141
+ - 80
142
+ - 83
143
+ - 98
144
+ "2":
145
+ - 1
146
+ - 11
147
+ - 41
148
+ - 49
149
+ - 50
150
+ - 51
151
+ - 71
152
+ - 80
153
+ - 83
154
+ - 98
155
+ "3":
156
+ - 7
157
+ - 13
158
+ - 19
159
+ - 62
160
+ - 66
161
+ "4": 3.11.14
162
+ "5": 0.25.0
163
+ "6": 4.51.3
164
+ "9":
165
+ "1": transformers_trainer
166
+ "12": 0.25.0
167
+ "13": linux-x86_64
168
+ accelerator_config:
169
+ value:
170
+ dispatch_batches: null
171
+ even_batches: true
172
+ gradient_accumulation_kwargs: null
173
+ non_blocking: false
174
+ split_batches: false
175
+ use_seedable_sampler: true
176
+ action_dim:
177
+ value: 36
178
+ action_head_cfg:
179
+ value:
180
+ _convert_: object
181
+ _target_: groot.vla.model.dreamzero.action_head.wan_flow_matching_action_tf.WANPolicyHead
182
+ config:
183
+ _recursive_: false
184
+ _target_: groot.vla.model.dreamzero.action_head.wan_flow_matching_action_tf.WANPolicyHeadConfig
185
+ action_dim: 36
186
+ action_horizon: 24
187
+ action_loss_embodiment_ids:
188
+ - 26
189
+ - 17
190
+ - 32
191
+ add_pos_embed: true
192
+ backbone_embedding_dim: 0
193
+ backbone_features_projector_cfg: null
194
+ decouple_video_action_noise: false
195
+ diffusion_model_cfg:
196
+ _convert_: object
197
+ _target_: groot.vla.model.dreamzero.modules.wan_video_dit_action_casual_chunk.CausalWanModel
198
+ action_dim: 36
199
+ diffusion_model_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P
200
+ dim: 5120
201
+ eps: 1e-06
202
+ ffn_dim: 13824
203
+ frame_seqlen: 220
204
+ freq_dim: 256
205
+ in_dim: 36
206
+ max_chunk_size: 4
207
+ model_type: i2v
208
+ num_action_per_block: 24
209
+ num_frame_per_block: 2
210
+ num_heads: 40
211
+ num_layers: 40
212
+ num_state_per_block: 1
213
+ out_dim: 16
214
+ expand_batch: null
215
+ freeze_decode_layer: false
216
+ hidden_size: 64
217
+ image_encoder_cfg:
218
+ _convert_: object
219
+ _target_: groot.vla.model.dreamzero.modules.wan_video_image_encoder.WanImageEncoder
220
+ image_encoder_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth
221
+ init_lora_weights: kaiming
222
+ input_embedding_dim: 1536
223
+ load_pretrained_det_decode_layer_path: null
224
+ lora_alpha: 4
225
+ lora_rank: 4
226
+ lora_target_modules: q,k,v,o,ffn.0,ffn.2
227
+ max_action_dim: 36
228
+ max_state_dim: 64
229
+ model_dtype: float32
230
+ noise_beta_alpha: 1.5
231
+ noise_beta_beta: 1
232
+ noise_s: 0.999
233
+ num_frame_per_block: 2
234
+ num_frames: 33
235
+ num_inference_timesteps: 4
236
+ num_timestep_buckets: 1000
237
+ repa_coeff: 1
238
+ repa_layer: 8
239
+ skip_component_loading: true
240
+ text_encoder_cfg:
241
+ _convert_: object
242
+ _target_: groot.vla.model.dreamzero.modules.wan_video_text_encoder.WanTextEncoder
243
+ text_encoder_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/models_t5_umt5-xxl-enc-bf16.pth
244
+ tile_size_height: 34
245
+ tile_size_width: 34
246
+ tile_stride_height: 18
247
+ tile_stride_width: 16
248
+ tiled: false
249
+ train_architecture: full
250
+ tune_diffusion_model: true
251
+ tune_projector: true
252
+ use_gradient_checkpointing: true
253
+ use_vlln: true
254
+ vae_cfg:
255
+ _convert_: object
256
+ _target_: groot.vla.model.dreamzero.modules.wan_video_vae.WanVideoVAE
257
+ vae_pretrained_path: /hfm/boqian/liboqian_data/checkpoints/Wan2.1-I2V-14B-480P/Wan2.1_VAE.pth
258
+ video_noise_beta_alpha: 3
259
+ video_noise_beta_beta: 1
260
+ vl_self_attention_cfg:
261
+ _target_: groot.vla.model.n1_5.modules.cross_attention_dit.SelfAttentionTransformer
262
+ attention_head_dim: 64
263
+ dropout: 0.2
264
+ final_dropout: true
265
+ num_attention_heads: 24
266
+ num_layers: 4
267
+ positional_embeddings: null
268
+ action_horizon:
269
+ value: 24
270
+ adafactor:
271
+ value: false
272
+ adam_beta1:
273
+ value: 0.95
274
+ adam_beta2:
275
+ value: 0.999
276
+ adam_epsilon:
277
+ value: 1e-08
278
+ add_cross_attention:
279
+ value: false
280
+ architectures:
281
+ value: null
282
+ auto_find_batch_size:
283
+ value: false
284
+ average_tokens_across_devices:
285
+ value: false
286
+ backbone_cfg:
287
+ value:
288
+ _target_: groot.vla.model.dreamzero.backbone.identity.IdentityBackbone
289
+ bad_words_ids:
290
+ value: null
291
+ batch_eval_metrics:
292
+ value: false
293
+ begin_suppress_tokens:
294
+ value: null
295
+ bf16:
296
+ value: true
297
+ bf16_full_eval:
298
+ value: false
299
+ bos_token_id:
300
+ value: null
301
+ chunk_size_feed_forward:
302
+ value: 0
303
+ cross_attention_hidden_size:
304
+ value: null
305
+ data_seed:
306
+ value: null
307
+ dataloader_drop_last:
308
+ value: false
309
+ dataloader_num_workers:
310
+ value: 1
311
+ dataloader_persistent_workers:
312
+ value: true
313
+ dataloader_pin_memory:
314
+ value: false
315
+ dataloader_prefetch_factor:
316
+ value: null
317
+ ddp_backend:
318
+ value: null
319
+ ddp_broadcast_buffers:
320
+ value: null
321
+ ddp_bucket_cap_mb:
322
+ value: 100
323
+ ddp_find_unused_parameters:
324
+ value: false
325
+ ddp_timeout:
326
+ value: 1800
327
+ debug:
328
+ value: []
329
+ decoder_start_token_id:
330
+ value: null
331
+ deepspeed:
332
+ value: groot/vla/configs/deepspeed/zero2_offload.json
333
+ disable_tqdm:
334
+ value: false
335
+ diversity_penalty:
336
+ value: 0
337
+ do_eval:
338
+ value: false
339
+ do_predict:
340
+ value: false
341
+ do_sample:
342
+ value: false
343
+ do_train:
344
+ value: false
345
+ early_stopping:
346
+ value: false
347
+ encoder_no_repeat_ngram_size:
348
+ value: 0
349
+ eos_token_id:
350
+ value: null
351
+ eval_accumulation_steps:
352
+ value: null
353
+ eval_delay:
354
+ value: 0
355
+ eval_do_concat_batches:
356
+ value: true
357
+ eval_on_start:
358
+ value: false
359
+ eval_steps:
360
+ value: null
361
+ eval_strategy:
362
+ value: "no"
363
+ eval_use_gather_object:
364
+ value: false
365
+ exponential_decay_length_penalty:
366
+ value: null
367
+ finetuning_task:
368
+ value: null
369
+ forced_bos_token_id:
370
+ value: null
371
+ forced_eos_token_id:
372
+ value: null
373
+ fp16:
374
+ value: false
375
+ fp16_backend:
376
+ value: auto
377
+ fp16_full_eval:
378
+ value: false
379
+ fp16_opt_level:
380
+ value: O1
381
+ fsdp:
382
+ value: []
383
+ fsdp_config:
384
+ value:
385
+ min_num_params: 0
386
+ xla: false
387
+ xla_fsdp_grad_ckpt: false
388
+ xla_fsdp_v2: false
389
+ fsdp_min_num_params:
390
+ value: 0
391
+ fsdp_transformer_layer_cls_to_wrap:
392
+ value: null
393
+ full_determinism:
394
+ value: false
395
+ gradient_accumulation_steps:
396
+ value: 1
397
+ gradient_checkpointing:
398
+ value: false
399
+ gradient_checkpointing_kwargs:
400
+ value: null
401
+ greater_is_better:
402
+ value: null
403
+ group_by_length:
404
+ value: false
405
+ half_precision_backend:
406
+ value: auto
407
+ hidden_size:
408
+ value: 0
409
+ hub_always_push:
410
+ value: false
411
+ hub_model_id:
412
+ value: null
413
+ hub_private_repo:
414
+ value: null
415
+ hub_strategy:
416
+ value: every_save
417
+ hub_token:
418
+ value: <HUB_TOKEN>
419
+ id2label:
420
+ value:
421
+ "0": LABEL_0
422
+ "1": LABEL_1
423
+ ignore_data_skip:
424
+ value: true
425
+ include_for_metrics:
426
+ value: []
427
+ include_inputs_for_metrics:
428
+ value: false
429
+ include_num_input_tokens_seen:
430
+ value: false
431
+ include_tokens_per_second:
432
+ value: false
433
+ is_decoder:
434
+ value: false
435
+ is_encoder_decoder:
436
+ value: false
437
+ jit_mode_eval:
438
+ value: false
439
+ label_names:
440
+ value: null
441
+ label_smoothing_factor:
442
+ value: 0
443
+ label2id:
444
+ value:
445
+ LABEL_0: 0
446
+ LABEL_1: 1
447
+ learning_rate:
448
+ value: 1e-05
449
+ length_column_name:
450
+ value: length
451
+ length_penalty:
452
+ value: 1
453
+ load_best_model_at_end:
454
+ value: false
455
+ local_rank:
456
+ value: 0
457
+ log_level:
458
+ value: passive
459
+ log_level_replica:
460
+ value: warning
461
+ log_on_each_node:
462
+ value: true
463
+ logging_dir:
464
+ value: ./checkpoints/dreamzero_real_teleop_g1_full_finetune/runs/Mar19_11-00-49_nebula100
465
+ logging_first_step:
466
+ value: false
467
+ logging_nan_inf_filter:
468
+ value: true
469
+ logging_steps:
470
+ value: 10
471
+ logging_strategy:
472
+ value: steps
473
+ lr_scheduler_type:
474
+ value: cosine
475
+ max_grad_norm:
476
+ value: 1
477
+ max_length:
478
+ value: 20
479
+ max_steps:
480
+ value: 20000
481
+ metric_for_best_model:
482
+ value: null
483
+ min_length:
484
+ value: 0
485
+ model/num_parameters:
486
+ value: 22924196696
487
+ model_dtype:
488
+ value: float32
489
+ model_type:
490
+ value: vla
491
+ mp_parameters:
492
+ value: ""
493
+ neftune_noise_alpha:
494
+ value: null
495
+ no_cuda:
496
+ value: false
497
+ no_repeat_ngram_size:
498
+ value: 0
499
+ num_beam_groups:
500
+ value: 1
501
+ num_beams:
502
+ value: 1
503
+ num_return_sequences:
504
+ value: 1
505
+ num_train_epochs:
506
+ value: 1000
507
+ optim:
508
+ value: adamw_torch
509
+ optim_args:
510
+ value: null
511
+ optim_target_modules:
512
+ value: null
513
+ output_attentions:
514
+ value: false
515
+ output_dir:
516
+ value: ./checkpoints/dreamzero_real_teleop_g1_full_finetune
517
+ output_hidden_states:
518
+ value: false
519
+ output_scores:
520
+ value: false
521
+ overwrite_output_dir:
522
+ value: false
523
+ pad_token_id:
524
+ value: null
525
+ past_index:
526
+ value: -1
527
+ per_device_eval_batch_size:
528
+ value: 64
529
+ per_device_train_batch_size:
530
+ value: 1
531
+ per_gpu_eval_batch_size:
532
+ value: null
533
+ per_gpu_train_batch_size:
534
+ value: null
535
+ prediction_loss_only:
536
+ value: false
537
+ prefix:
538
+ value: null
539
+ problem_type:
540
+ value: null
541
+ push_to_hub:
542
+ value: false
543
+ push_to_hub_model_id:
544
+ value: null
545
+ push_to_hub_organization:
546
+ value: null
547
+ push_to_hub_token:
548
+ value: <PUSH_TO_HUB_TOKEN>
549
+ ray_scope:
550
+ value: last
551
+ remove_invalid_values:
552
+ value: false
553
+ remove_unused_columns:
554
+ value: false
555
+ repetition_penalty:
556
+ value: 1
557
+ report_to:
558
+ value:
559
+ - wandb
560
+ restore_callback_states_from_checkpoint:
561
+ value: false
562
+ resume_from_checkpoint:
563
+ value: null
564
+ resume_path:
565
+ value: ./checkpoints/dreamzero_real_teleop_g1_full_finetune
566
+ return_dict:
567
+ value: true
568
+ return_dict_in_generate:
569
+ value: false
570
+ run_name:
571
+ value: dreamzero_real_teleop_g1_full_finetune
572
+ save_on_each_node:
573
+ value: false
574
+ save_only_model:
575
+ value: false
576
+ save_safetensors:
577
+ value: true
578
+ save_steps:
579
+ value: 8000
580
+ save_strategy:
581
+ value: steps
582
+ save_total_limit:
583
+ value: 10
584
+ seed:
585
+ value: 42
586
+ sep_token_id:
587
+ value: null
588
+ skip_memory_metrics:
589
+ value: true
590
+ suppress_tokens:
591
+ value: null
592
+ task_specific_params:
593
+ value: null
594
+ temperature:
595
+ value: 1
596
+ tf_legacy_loss:
597
+ value: false
598
+ tf32:
599
+ value: true
600
+ tie_encoder_decoder:
601
+ value: false
602
+ tie_word_embeddings:
603
+ value: true
604
+ tokenizer_class:
605
+ value: null
606
+ top_k:
607
+ value: 50
608
+ top_p:
609
+ value: 1
610
+ torch_compile:
611
+ value: false
612
+ torch_compile_backend:
613
+ value: null
614
+ torch_compile_mode:
615
+ value: null
616
+ torch_dtype:
617
+ value: null
618
+ torch_empty_cache_steps:
619
+ value: null
620
+ torchdynamo:
621
+ value: null
622
+ torchscript:
623
+ value: false
624
+ tp_size:
625
+ value: 0
626
+ tpu_metrics_debug:
627
+ value: false
628
+ tpu_num_cores:
629
+ value: null
630
+ transformers_version:
631
+ value: 4.51.3
632
+ typical_p:
633
+ value: 1
634
+ use_bfloat16:
635
+ value: false
636
+ use_cpu:
637
+ value: false
638
+ use_ipex:
639
+ value: false
640
+ use_legacy_prediction_loop:
641
+ value: false
642
+ use_liger_kernel:
643
+ value: false
644
+ use_mps_device:
645
+ value: false
646
+ warmup_ratio:
647
+ value: 0.05
648
+ warmup_steps:
649
+ value: 0
650
+ weight_decay:
651
+ value: 1e-05
dreamzero_real_teleop_g1_full_finetune_relative_Pick_bottle_and_turn_and_pour_into_cup/wandb/run-20260319_110703-lsme06f2/files/output.log ADDED
The diff for this file is too large to render. See raw diff