File size: 3,382 Bytes
e94400c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
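# PI0 training config - uses the unified 37D action representation.
# The action projection layers (hard-coded to 32D in upstream openpi) are replaced with
# 37D versions when PI0Framework is initialised; state_proj is resized separately
# according to framework.pi0.state_dim.
# The corresponding 32D parameters in the checkpoint are skipped on load; all other
# backbone parameters are reused as usual.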

# Run identity, output directory, seed and experiment-tracking settings.
run_id: pi0_unified_37d
run_root_dir: ./runs
seed: 42
trackers:
  - jsonl
  - wandb
wandb_entity: timsty
wandb_project: vla_jepa
is_debug: false

framework:
  name: PI0
  # PI0 model configuration.
  # action_dim follows this project's unified 37D action representation.
  # The PI0Pytorch source hard-codes action_in_proj / action_out_proj / state_proj
  # to 32D; PI0Framework.__init__ calls _replace_pi0_projection_layers to swap them
  # for layers sized from this config (37D for the action projections; state_proj
  # follows state_dim, see below).
  # When the checkpoint is loaded these layers are skipped because their shapes do
  # not match, so they stay randomly initialised. The remaining backbone layers
  # (PaliGemma, action expert transformer, ...) still load from the checkpoint.
  pi0:
    paligemma_variant: 'gemma_2b'
    action_expert_variant: 'gemma_300m'
    pi05: false
    # Project-wide unified action dimension; the projection layers are replaced
    # automatically and the corresponding checkpoint parameters are not loaded.
    action_dim: 37
    # Unified state dimension; state_proj is replaced with Linear(74, width) and is
    # independent of action_dim.
    state_dim: 74
    # Kept equal to datasets.vla_data.chunk_size.
    action_horizon: 15
    dtype: 'bfloat16'

  # Path to the pretrained weights (the original note cites pi05_libero as an
  # example; this path points at pi0_base_torch). When action_dim does not match,
  # the checkpoint is loaded partially with strict=False.
  pi0_checkpoint: /mnt/data/fangyu/model/openpi/openpi-assets/checkpoints/pi0_base_torch/model.pt

  # PaliGemma tokenizer model file.
  tokenizer_path: /root/.cache/openpi/big_vision/paligemma_tokenizer.model

  # Image key names, matching the openpi three-view layout. For single-view gr1
  # data, use together with replicate_single_view.
  image_keys:
    - 'base_0_rgb'
    - 'left_wrist_0_rgb'
    - 'right_wrist_0_rgb'

  # When the dataset provides only one image, copy it to all 3 views
  # (e.g. fourier_gr1 video.ego_view).
  replicate_single_view: true

  # NOTE(review): datasets.vla_data.include_state is false in this file while
  # use_state is true here - confirm the two are meant to differ.
  use_state: true

  # If true, use the first N entries of image_keys according to the actual number
  # of images; otherwise always use all keys and zero-pad the missing views.
  dynamic_image_keys: false

  num_inference_steps: 10

  # Output truncation dimension; null means the full action_dim is emitted.
  effective_action_dim: null

# Dataset / dataloader settings for the VLA training data.
datasets:
  vla_data:
    dataset_py: lerobot_datasets
    data_root_dir: /mnt/data/fangyu/dataset/IPEC-COMMUNITY
    # NOTE(review): "embodiedment" looks like a misspelling of "embodiment", but this
    # is presumably a mix name looked up elsewhere - rename only together with its
    # definition.
    data_mix: cross_embodiedment_simulator
    default_image_resolution: [3, 224, 224]
    per_device_batch_size: 32
    load_all_data_for_training: true
    obs:
      - 'image_0'
    image_size: [224, 224]
    video_backend: torchcodec
    load_video: true
    # Same value as framework.pi0.action_horizon.
    chunk_size: 15
    state_use_action_chunk: false
    num_history_steps: 0
    # State is not used when training PI0.
    # NOTE(review): framework.use_state is true in this file - confirm the two
    # settings are meant to differ.
    include_state: false

# Optimisation schedule, checkpointing and optimizer settings.
trainer:
  # Training length and checkpoint cadence.
  epochs: 100
  max_train_steps: 20000
  # NOTE(review): warmup_ratio below is 0.1, which would be 2000 of the 20000 steps,
  # not 5000 - confirm which of the two settings the scheduler actually reads.
  num_warmup_steps: 5000
  num_stable_steps: 0
  save_interval: 5000
  max_checkpoints_to_keep: 20

  # Learning rates: a base value plus a value keyed by pi0_model.
  learning_rate:
    base: 2.5e-5
    pi0_model: 2.5e-5

  lr_scheduler_type: warmup_stable_cosine
  scheduler_specific_kwargs:
    min_lr_ratio: 0.001

  # Empty string: no module names listed.
  freeze_modules: ''
  warmup_ratio: 0.1
  # NOTE(review): optimizer.weight_decay below is 1.0e-08 while this one is 0.0 -
  # confirm which value takes effect.
  weight_decay: 0.0
  logging_frequency: 10
  gradient_clipping: 5.0
  gradient_accumulation_steps: 1

  optimizer:
    name: AdamW
    betas: [0.9, 0.95]
    eps: 1.0e-08
    weight_decay: 1.0e-08

  # Resume / runtime switches.
  is_resume: false
  pretrained_checkpoint: null
  enable_gradient_checkpointing: false
  enable_mixed_precision_training: true