# File size: 5,564 Bytes

# ba10c7d

# Top-level run settings: output location, data-loader options, optimizer and
# schedule, logging/eval cadence, and checkpoint sources.
# NOTE(review): the directory name contains "1e-4" while learning_rate below is
# 6.0e-05 — presumably a historical label; confirm. The "nw8_pin_on" suffix
# agrees with num_workers: 8 and pin_memory: true.
output_dir: ./runs/libero_uncond_2cam224_1e-4/2026-04-22_12x12_h100x2_trainableonly_resume_from2000_nw8_pin_on
batch_size: 32
num_workers: 8
pin_memory: true
lr_scheduler_type: cosine
learning_rate: 6.0e-05
num_epochs: 30
# max_steps is null — presumably run length is then set by num_epochs; confirm
# against the trainer.
max_steps: null
log_every: 10
# NOTE(review): 0 presumably disables periodic saves (checkpoint.save_latest
# and save_best_* are enabled below) — confirm against the trainer.
save_every: 0
eval_every: 1000
eval_num_inference_steps: 10
# Both video-related eval flags are off; only action metrics are enabled.
eval_enable_video: false
eval_save_video: false
eval_enable_action_metrics: true
pre_save_cleanup: true
pre_save_cleanup_sleep_seconds: 5.0
pre_save_cleanup_malloc_trim: true
gradient_accumulation_steps: 1
mixed_precision: bf16
seed: 42
max_grad_norm: 1.0
weight_decay: 0.01
# Three checkpoint inputs: `resume` is unset, `init_checkpoint` points at a
# release checkpoint, and `resume_training_state` points at latest_training.pt
# inside this run's own output_dir.
# NOTE(review): how the trainer prioritises these when several are set is not
# visible here — confirm.
resume: null
init_checkpoint: ./checkpoints/fastwam_release/libero_uncond_2cam224.pt
resume_training_state: ./runs/libero_uncond_2cam224_1e-4/2026-04-22_12x12_h100x2_trainableonly_resume_from2000_nw8_pin_on/checkpoints/latest_training.pt
# Checkpoint-writing options.
checkpoint:
  policy: auto
  # "trainable_only" agrees with the "trainableonly" tag in output_dir.
  lightweight_resume_backend: trainable_only
  # NOTE(review): presumably optimizer state is left out of trainable-only
  # checkpoints when false — confirm what a resume restores in that case.
  trainable_only_include_optimizer_state: false
  save_latest: true
  # Best-checkpoint tracking is on for both action L1 and action L2.
  save_best_action_l1: true
  save_best_action_l2: true
# Weights & Biases settings. `enabled` is false, so the remaining fields are
# presumably unused for this run — confirm.
wandb:
  enabled: false
  workspace: null
  project: fast-wam
  # NOTE(review): this name says "resume_from65000" and "20260425", whereas
  # output_dir says "resume_from2000" and "2026-04-22" — confirm which is
  # current before enabling logging.
  name: libero_12x12_trainableonly_resume_from65000_20260425
  group: null
  mode: online
# Dataset section; only a `train` entry is declared.
data:
  train:
    _target_: fastwam.datasets.lerobot.robot_video_dataset.RobotVideoDataset
    # Four LIBERO suites are listed: spatial, object, goal and 10.
    dataset_dirs:
    - ./data/libero_mujoco3.3.2/libero_spatial_no_noops_lerobot
    - ./data/libero_mujoco3.3.2/libero_object_no_noops_lerobot
    - ./data/libero_mujoco3.3.2/libero_goal_no_noops_lerobot
    - ./data/libero_mujoco3.3.2/libero_10_no_noops_lerobot
    # Per-modality shapes. `processor.shape_meta` further down repeats these
    # values exactly; keep the two copies in sync when editing.
    shape_meta:
      images:
      - key: image
        raw_shape: [3, 512, 512]
        shape: [3, 224, 224]
      - key: wrist_image
        raw_shape: [3, 512, 512]
        shape: [3, 224, 224]
      action:
      - key: default
        raw_shape: 7
        shape: 7
      state:
      - key: default
        raw_shape: 8
        shape: 8
    num_frames: 33
    global_sample_stride: 1
    action_video_freq_ratio: 4
    # NOTE(review): 224 x 448 equals two 224 x 224 views placed side by side,
    # which fits concat_multi_camera: horizontal — presumably (height, width);
    # confirm.
    video_size: [224, 448]
    camera_key: null
    val_set_proportion: 0.0
    is_training_set: true
    skip_padding_as_possible: false
    concat_multi_camera: horizontal
    processor:
      _target_: fastwam.datasets.lerobot.processors.fastwam_processor.FastWAMProcessor
      # Same content as train.shape_meta above.
      shape_meta:
        images:
        - key: image
          raw_shape: [3, 512, 512]
          shape: [3, 224, 224]
        - key: wrist_image
          raw_shape: [3, 512, 512]
          shape: [3, 224, 224]
        action:
        - key: default
          raw_shape: 7
          shape: 7
        state:
        - key: default
          raw_shape: 8
          shape: 8
      # 33 equals train.num_frames; 2 equals the number of image keys; 7 and 8
      # equal the action and state shapes declared in shape_meta.
      num_obs_steps: 33
      num_output_cameras: 2
      action_output_dim: 7
      proprio_output_dim: 8
      # Seven entries, one per action dimension: the first six are true and
      # the last is false.
      delta_action_dim_mask:
        default: [true, true, true, true, true, true, false]
      action_state_transforms: null
      use_stepwise_action_norm: false
      norm_default_mode: min/max
      norm_exception_mode: null
      action_state_merger:
        _target_: fastwam.datasets.lerobot.transforms.action_state_merger.ConcatLeftAlign
      # Train and val pipelines are identical: ToTensor, then Resize to
      # 224 x 224.
      train_transforms:
      - _target_: fastwam.datasets.lerobot.transforms.image.ToTensor
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
      val_transforms:
      - _target_: fastwam.datasets.lerobot.transforms.image.ToTensor
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
    text_embedding_cache_dir: ./data/text_embeds_cache/libero
    # 128 equals model.tokenizer_max_len elsewhere in this file.
    context_len: 128
# Model construction arguments, passed to the factory named by `_target_`.
model:
  _target_: fastwam.runtime.create_fastwam
  # NOTE(review): the model and tokenizer ids name different Wan releases
  # (Wan2.2-TI2V-5B vs Wan2.1-T2V-1.3B) — presumably intentional; confirm.
  model_id: Wan-AI/Wan2.2-TI2V-5B
  tokenizer_model_id: Wan-AI/Wan2.1-T2V-1.3B
  # 128 equals data.train.context_len elsewhere in this file.
  tokenizer_max_len: 128
  # The text encoder is not loaded; data.train sets a text_embedding_cache_dir,
  # so embeddings presumably come from that cache — confirm.
  load_text_encoder: false
  # 8 equals the state shape / proprio_output_dim in the data section.
  proprio_dim: 8
  redirect_common_files: true
  mot_checkpoint_mixed_attn: false
  # Unlike the other paths in this file this one has no leading "./"; both
  # forms are relative. "1024hdim" agrees with action_dit_config.hidden_dim.
  action_dit_pretrained_path: checkpoints/ActionDiT_linear_interp_Wan22_alphascale_1024hdim.pt
  skip_dit_load_from_pretrain: false
  # Video DiT: num_heads (24) x attn_head_dim (128) = 3072 = hidden_dim.
  video_dit_config:
    has_image_input: false
    patch_size:
    - 1
    - 2
    - 2
    in_dim: 48
    hidden_dim: 3072
    ffn_dim: 14336
    freq_dim: 256
    text_dim: 4096
    out_dim: 48
    num_heads: 24
    attn_head_dim: 128
    num_layers: 30
    eps: 1.0e-06
    # NOTE(review): "seperated" is misspelled, but the key must match whatever
    # the consuming code reads — do not rename it here without checking.
    seperated_timestep: true
    require_clip_embedding: false
    require_vae_embedding: false
    fuse_vae_embedding_in_latents: true
    use_gradient_checkpointing: false
    video_attention_mask_mode: first_frame_causal
    # action_conditioned is false although action_dim is still specified.
    action_conditioned: false
    action_dim: 7
    action_group_causal_mask_mode: group_diagonal
  # Action DiT: same num_layers (30), num_heads (24) and attn_head_dim (128)
  # as the video DiT, but hidden_dim is 1024.
  # NOTE(review): num_heads x attn_head_dim = 3072 here, which equals the video
  # DiT's hidden_dim rather than this block's — presumably deliberate so both
  # branches use the same attention width; confirm.
  action_dit_config:
    action_dim: 7
    hidden_dim: 1024
    ffn_dim: 4096
    num_heads: 24
    attn_head_dim: 128
    num_layers: 30
    text_dim: 4096
    freq_dim: 256
    eps: 1.0e-06
    use_gradient_checkpointing: false
  # The video and action schedulers carry identical values.
  video_scheduler:
    train_shift: 5.0
    infer_shift: 5.0
    num_train_timesteps: 1000
  action_scheduler:
    train_shift: 5.0
    infer_shift: 5.0
    num_train_timesteps: 1000
  # Video and action loss terms are weighted equally.
  loss:
    lambda_video: 1.0
    lambda_action: 1.0
  # "pfd" options. The meaning of `stage` and `training_mode` is defined by the
  # consuming code and is not visible in this file.
  pfd:
    enabled: true
    stage: s1
    # "action512" agrees with adapter.hidden_dim: 512 below.
    training_mode: action512_partial
    adapter:
      type: mlp
      hidden_dim: 512
      depth: 3
      freq_dim: 256
    # 12 and 12 agree with the "12x12" tag in output_dir and wandb.name; each
    # DiT has 30 layers in total.
    partial_unfreeze:
      action_last_layers: 12
      video_last_layers: 12
    lambda_gt: 1.0
    lambda_res: 0.5
    lambda_teacher: 0.1