File size: 5,564 Bytes
ba10c7d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# Top-level training-run settings: output location, data-loader options,
# schedule values, logging/eval cadence, cleanup switches and checkpoint
# sources.
# NOTE(review): the parent directory name contains "1e-4" while learning_rate
# below is 6.0e-05 -- confirm the directory label is only an experiment name.
output_dir: ./runs/libero_uncond_2cam224_1e-4/2026-04-22_12x12_h100x2_trainableonly_resume_from2000_nw8_pin_on
# Data-loader options. num_workers and pin_memory agree with the
# "nw8_pin_on" suffix of output_dir.
batch_size: 32
num_workers: 8
pin_memory: true
# Learning-rate schedule and run length. max_steps is null, so num_epochs is
# presumably the stopping criterion -- TODO confirm against the trainer.
lr_scheduler_type: cosine
learning_rate: 6.0e-05
num_epochs: 30
max_steps: null
# Logging / saving / evaluation cadence.
# NOTE(review): save_every is 0 -- presumably this turns periodic saving off
# and leaves saving to the `checkpoint` section; verify.
log_every: 10
save_every: 0
eval_every: 1000
eval_num_inference_steps: 10
# Both video-related eval switches are off; action metrics are on.
eval_enable_video: false
eval_save_video: false
eval_enable_action_metrics: true
# Cleanup performed before saving (all three switches are set).
pre_save_cleanup: true
pre_save_cleanup_sleep_seconds: 5.0
pre_save_cleanup_malloc_trim: true
# Numerics and regularisation.
gradient_accumulation_steps: 1
mixed_precision: bf16
seed: 42
max_grad_norm: 1.0
weight_decay: 0.01
# Checkpoint sources. `resume` is null; `resume_training_state` points at
# checkpoints/latest_training.pt inside this run's own output_dir.
resume: null
init_checkpoint: ./checkpoints/fastwam_release/libero_uncond_2cam224.pt
resume_training_state: ./runs/libero_uncond_2cam224_1e-4/2026-04-22_12x12_h100x2_trainableonly_resume_from2000_nw8_pin_on/checkpoints/latest_training.pt
# Checkpoint-writing options.
checkpoint:
  policy: auto
  # NOTE(review): with the trainable_only backend and
  # trainable_only_include_optimizer_state set to false, optimizer state is
  # presumably not stored -- confirm that resuming without it is intended.
  lightweight_resume_backend: trainable_only
  trainable_only_include_optimizer_state: false
  # Keep the latest checkpoint plus the best ones by action L1 and action L2.
  save_latest: true
  save_best_action_l1: true
  save_best_action_l2: true
# Weights & Biases logging settings. `enabled` is false, so the remaining keys
# are presumably unused in this run -- verify against the logger setup.
# NOTE(review): `name` says "resume_from65000_20260425" whereas output_dir says
# "2026-04-22 ... resume_from2000" -- confirm which label is current.
wandb:
  enabled: false
  workspace: null
  project: fast-wam
  name: libero_12x12_trainableonly_resume_from65000_20260425
  group: null
  mode: online
# Dataset configuration; this section defines a single `train` entry.
data:
  train:
    _target_: fastwam.datasets.lerobot.robot_video_dataset.RobotVideoDataset
    # Four dataset roots are listed: spatial, object, goal and 10.
    dataset_dirs:
    - ./data/libero_mujoco3.3.2/libero_spatial_no_noops_lerobot
    - ./data/libero_mujoco3.3.2/libero_object_no_noops_lerobot
    - ./data/libero_mujoco3.3.2/libero_goal_no_noops_lerobot
    - ./data/libero_mujoco3.3.2/libero_10_no_noops_lerobot
    # Shapes per modality: two image keys (raw_shape 3x512x512, shape
    # 3x224x224), one 7-wide action entry and one 8-wide state entry.
    shape_meta:
      images:
      - key: image
        raw_shape: [3, 512, 512]
        shape: [3, 224, 224]
      - key: wrist_image
        raw_shape: [3, 512, 512]
        shape: [3, 224, 224]
      action:
      - key: default
        raw_shape: 7
        shape: 7
      state:
      - key: default
        raw_shape: 8
        shape: 8
    # Temporal sampling. num_frames equals processor.num_obs_steps (33).
    num_frames: 33
    global_sample_stride: 1
    action_video_freq_ratio: 4
    # 224 x 448 is the size of two 224x224 views side by side, consistent with
    # concat_multi_camera: horizontal (assumes height-then-width order --
    # TODO confirm).
    video_size: [224, 448]
    camera_key: null
    val_set_proportion: 0.0
    is_training_set: true
    skip_padding_as_possible: false
    concat_multi_camera: horizontal
    processor:
      _target_: fastwam.datasets.lerobot.processors.fastwam_processor.FastWAMProcessor
      # Same values as data.train.shape_meta above; keep the two in sync.
      shape_meta:
        images:
        - key: image
          raw_shape: [3, 512, 512]
          shape: [3, 224, 224]
        - key: wrist_image
          raw_shape: [3, 512, 512]
          shape: [3, 224, 224]
        action:
        - key: default
          raw_shape: 7
          shape: 7
        state:
        - key: default
          raw_shape: 8
          shape: 8
      num_obs_steps: 33
      # Output sizes agree with shape_meta: 2 image keys, action 7, state 8.
      num_output_cameras: 2
      action_output_dim: 7
      proprio_output_dim: 8
      # One flag per action dimension: true for the first six, false for the
      # seventh.
      delta_action_dim_mask:
        default: [true, true, true, true, true, true, false]
      action_state_transforms: null
      # Normalisation settings.
      use_stepwise_action_norm: false
      norm_default_mode: min/max
      norm_exception_mode: null
      action_state_merger:
        _target_: fastwam.datasets.lerobot.transforms.action_state_merger.ConcatLeftAlign
      # Train and val pipelines are identical: ToTensor, then Resize to
      # 224x224.
      train_transforms:
      - _target_: fastwam.datasets.lerobot.transforms.image.ToTensor
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
      val_transforms:
      - _target_: fastwam.datasets.lerobot.transforms.image.ToTensor
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
    # Text-embedding cache location; context_len equals
    # model.tokenizer_max_len (128).
    text_embedding_cache_dir: ./data/text_embeds_cache/libero
    context_len: 128
# Model factory (`_target_`) and the arguments passed to it.
model:
  _target_: fastwam.runtime.create_fastwam
  # NOTE(review): model_id and tokenizer_model_id name different repos
  # (Wan2.2-TI2V-5B vs Wan2.1-T2V-1.3B) -- confirm this pairing is intended.
  model_id: Wan-AI/Wan2.2-TI2V-5B
  tokenizer_model_id: Wan-AI/Wan2.1-T2V-1.3B
  # Equals data.train.context_len (128).
  tokenizer_max_len: 128
  # Text encoder is not loaded; data.train.text_embedding_cache_dir is set, so
  # embeddings presumably come from that cache -- verify.
  load_text_encoder: false
  # Equals the state width in data.train.shape_meta (8).
  proprio_dim: 8
  redirect_common_files: true
  mot_checkpoint_mixed_attn: false
  # Unlike the other paths in this file, this one has no leading "./".
  # The "1024hdim" in the file name agrees with action_dit_config.hidden_dim.
  action_dit_pretrained_path: checkpoints/ActionDiT_linear_interp_Wan22_alphascale_1024hdim.pt
  skip_dit_load_from_pretrain: false
  # Video DiT hyper-parameters.
  # num_heads * attn_head_dim = 24 * 128 = 3072 = hidden_dim.
  video_dit_config:
    has_image_input: false
    patch_size:
    - 1
    - 2
    - 2
    in_dim: 48
    hidden_dim: 3072
    ffn_dim: 14336
    freq_dim: 256
    text_dim: 4096
    out_dim: 48
    num_heads: 24
    attn_head_dim: 128
    num_layers: 30
    eps: 1.0e-06
    # NOTE(review): the key is spelled "seperated"; it is presumably read
    # verbatim by the consumer, so do not rename it here without renaming the
    # reader as well.
    seperated_timestep: true
    require_clip_embedding: false
    require_vae_embedding: false
    fuse_vae_embedding_in_latents: true
    use_gradient_checkpointing: false
    video_attention_mask_mode: first_frame_causal
    action_conditioned: false
    action_dim: 7
    action_group_causal_mask_mode: group_diagonal
  # Action DiT hyper-parameters. num_layers, num_heads and attn_head_dim equal
  # the video DiT values.
  # NOTE(review): hidden_dim is 1024 while num_heads * attn_head_dim is 3072 --
  # presumably intentional so both DiTs use the same attention width; confirm.
  action_dit_config:
    action_dim: 7
    hidden_dim: 1024
    ffn_dim: 4096
    num_heads: 24
    attn_head_dim: 128
    num_layers: 30
    text_dim: 4096
    freq_dim: 256
    eps: 1.0e-06
    use_gradient_checkpointing: false
  # The video and action schedulers carry identical settings.
  video_scheduler:
    train_shift: 5.0
    infer_shift: 5.0
    num_train_timesteps: 1000
  action_scheduler:
    train_shift: 5.0
    infer_shift: 5.0
    num_train_timesteps: 1000
  # Loss weights for the video and action terms (both 1.0).
  loss:
    lambda_video: 1.0
    lambda_action: 1.0
  # PFD settings.
  pfd:
    enabled: true
    stage: s1
    training_mode: action512_partial
    # adapter.hidden_dim is 512, the same number that appears in
    # training_mode ("action512") -- presumably related; verify.
    adapter:
      type: mlp
      hidden_dim: 512
      depth: 3
      freq_dim: 256
    # 12 / 12 agrees with the "12x12" tag in output_dir and wandb.name.
    partial_unfreeze:
      action_last_layers: 12
      video_last_layers: 12
    lambda_gt: 1.0
    lambda_res: 0.5
    lambda_teacher: 0.1