# Top-level run settings: checkpoint location, optimiser hyperparameters and
# logging / checkpoint / evaluation cadence.
# NOTE(review): output_dir is an absolute, machine-specific path.
output_dir: /datadrive/wjy/ckpt/fastwam_track_libero_0526
batch_size: 12
num_workers: 8
lr_scheduler_type: cosine
learning_rate: 0.0001
# NOTE(review): both num_epochs and max_steps are set; which limit ends the
# run first is decided by the trainer, not by this file — confirm.
num_epochs: 10
max_steps: 20000
# Cadence values; presumably counted in training steps — TODO confirm.
log_every: 10
save_every: 2000
eval_every: 200
eval_num_inference_steps: 10
gradient_accumulation_steps: 1
mixed_precision: bf16
seed: 42
max_grad_norm: 1.0
weight_decay: 0.01
# resume is null here; presumably this means "start a fresh run" — confirm.
resume: null
# Experiment-tracking (wandb) options.
# The run name embeds "2cam224" and "1e-4", which match the two 224 x 224
# camera entries and learning_rate: 0.0001 elsewhere in this file; update the
# name if those values change.
wandb:
  enabled: true
  # workspace and group are explicitly null (unset) in this config.
  workspace: null
  project: fast-wam
  name: libero_track_2cam224_1e-4
  group: null
  mode: online
# Dataset configuration. `_target_` holds a dotted import path; presumably the
# config framework instantiates that class with the sibling keys as arguments
# — TODO confirm against the loader.
data:
  train:
    _target_: fastwam.datasets.lerobot.track_robot_video_dataset.TrackRobotVideoDataset
    # Four LIBERO suite directories under one dataset root (absolute,
    # machine-specific paths).
    dataset_dirs:
    - /datadrive2/wjy/dataset/LIBERO_fastwam_with_marker/libero_spatial
    - /datadrive2/wjy/dataset/LIBERO_fastwam_with_marker/libero_object
    - /datadrive2/wjy/dataset/LIBERO_fastwam_with_marker/libero_goal
    - /datadrive2/wjy/dataset/LIBERO_fastwam_with_marker/libero_10
    # Located under the same root as dataset_dirs. The file name suggests an
    # episode whitelist — verify against the dataset class.
    track_episodes_file: /datadrive2/wjy/dataset/LIBERO_fastwam_with_marker/full_whitelist.txt
    # Shape description for each modality the dataset provides.
    # raw_shape vs shape: presumably the stored size and the size handed to the
    # model — TODO confirm. An identical structure appears under
    # processor.shape_meta further down; keep the two in sync.
    shape_meta:
      # Two camera streams, both 3 x 512 x 512 raw and 3 x 224 x 224 after.
      images:
      - key: image
        raw_shape: [3, 512, 512]
        shape: [3, 224, 224]
      - key: wrist_image
        raw_shape: [3, 512, 512]
        shape: [3, 224, 224]
      # 13-dimensional action vector.
      action:
      - key: default
        raw_shape: 13
        shape: 13
      # 8-dimensional state vector.
      state:
      - key: default
        raw_shape: 8
        shape: 8
    # Clip sampling and multi-camera layout.
    # num_frames equals processor.num_obs_steps (both 33) in this file.
    num_frames: 33
    global_sample_stride: 1
    action_video_freq_ratio: 4
    # 448 = 2 x 224, consistent with two 224-wide views placed side by side
    # (concat_multi_camera: horizontal). Presumably ordered [height, width] —
    # TODO confirm.
    video_size: [224, 448]
    camera_key: null
    val_set_proportion: 0.0
    is_training_set: true
    skip_padding_as_possible: false
    concat_multi_camera: horizontal
    # Sample processor, named by dotted import path in `_target_`.
    processor:
      _target_: fastwam.datasets.lerobot.processors.fastwam_processor.FastWAMProcessor
      # Same content as data.train.shape_meta above; keep the two in sync.
      shape_meta:
        images:
        - key: image
          raw_shape: [3, 512, 512]
          shape: [3, 224, 224]
        - key: wrist_image
          raw_shape: [3, 512, 512]
          shape: [3, 224, 224]
        action:
        - key: default
          raw_shape: 13
          shape: 13
        state:
        - key: default
          raw_shape: 8
          shape: 8
      # num_obs_steps equals data.train.num_frames (33).
      num_obs_steps: 33
      # Matches the two entries listed under shape_meta.images.
      num_output_cameras: 2
      # Match the action (13) and state (8) shapes declared in shape_meta.
      action_output_dim: 13
      proprio_output_dim: 8
      # Per-dimension boolean masks over the 13 action dimensions
      # (action_output_dim: 13). Each list below has exactly 13 entries.
      # Indices 0-5 are true here, indices 6-12 are false.
      delta_action_dim_mask:
        default:
          [true, true, true, true, true, true,
           false, false, false, false, false, false, false]
      action_state_transforms: null
      use_stepwise_action_norm: false
      norm_default_mode: min/max
      norm_exception_mode: null
      # Indices 0-6 are false here, indices 7-12 are true.
      # NOTE(review): index 6 is false in both masks; presumably intentional —
      # confirm against the processor implementation.
      identity_dim_mask:
        action:
          default:
            [false, false, false, false, false, false, false,
             true, true, true, true, true, true]
      # Action/state merger, named by dotted import path.
      action_state_merger:
        _target_: fastwam.datasets.lerobot.transforms.action_state_merger.ConcatLeftAlign
      # Image pipelines. The train and val lists are identical in this file:
      # ToTensor followed by a Resize to 224 x 224.
      train_transforms:
      - _target_: fastwam.datasets.lerobot.transforms.image.ToTensor
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
      val_transforms:
      - _target_: fastwam.datasets.lerobot.transforms.image.ToTensor
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
    # Relative path; presumably resolved against the process working
    # directory — TODO confirm.
    text_embedding_cache_dir: ./data/text_embeds_cache/libero
    # Same value as model.tokenizer_max_len (128) in this file.
    context_len: 128
# Model construction. `_target_` holds a dotted import path to a factory;
# presumably called with the sibling keys as arguments — TODO confirm.
model:
  _target_: fastwam.runtime.create_fastwam_track
  # NOTE(review): model_id and tokenizer_model_id point at different
  # repositories (Wan2.2-TI2V-5B vs Wan2.1-T2V-1.3B); presumably intentional —
  # confirm.
  model_id: Wan-AI/Wan2.2-TI2V-5B
  tokenizer_model_id: Wan-AI/Wan2.1-T2V-1.3B
  # Same value as data.train.context_len (128).
  tokenizer_max_len: 128
  load_text_encoder: false
  # Same value as processor.proprio_output_dim (8).
  proprio_dim: 8
  redirect_common_files: true
  mot_checkpoint_mixed_attn: false
  # Relative path; presumably resolved against the process working directory
  # — TODO confirm.
  action_dit_pretrained_path: checkpoints/ActionDiT_linear_interp_Wan22_alphascale_1024hdim.pt
  skip_dit_load_from_pretrain: false
  # Video DiT hyperparameters.
  # In this file: in_dim == out_dim == 48, and
  # num_heads * attn_head_dim == 24 * 128 == 3072 == hidden_dim.
  video_dit_config:
    has_image_input: false
    patch_size: [1, 2, 2]
    in_dim: 48
    hidden_dim: 3072
    ffn_dim: 14336
    freq_dim: 256
    text_dim: 4096
    out_dim: 48
    num_heads: 24
    attn_head_dim: 128
    num_layers: 30
    eps: 1.0e-06
    # NOTE(review): the key is spelled "seperated" (sic). It is left as is
    # because the consumer presumably looks it up by this exact name.
    seperated_timestep: true
    require_clip_embedding: false
    require_vae_embedding: false
    fuse_vae_embedding_in_latents: true
    use_gradient_checkpointing: false
    video_attention_mask_mode: first_frame_causal
    action_conditioned: false
    # Same value as action_dit_config.action_dim and processor.action_output_dim.
    action_dim: 13
    action_group_causal_mask_mode: group_diagonal
  # Action DiT hyperparameters.
  # num_heads, attn_head_dim, num_layers, text_dim, freq_dim and eps have the
  # same values as in video_dit_config.
  # NOTE(review): num_heads * attn_head_dim == 24 * 128 == 3072, which differs
  # from hidden_dim: 1024 here. Presumably intentional so the attention shape
  # lines up with the video DiT — confirm against the model code.
  action_dit_config:
    action_dim: 13
    hidden_dim: 1024
    ffn_dim: 4096
    num_heads: 24
    attn_head_dim: 128
    num_layers: 30
    text_dim: 4096
    freq_dim: 256
    eps: 1.0e-06
    use_gradient_checkpointing: false
  # Scheduler settings for the video and action branches. Both use the same
  # train_shift, infer_shift and num_train_timesteps values in this file.
  video_scheduler:
    train_shift: 5.0
    infer_shift: 5.0
    num_train_timesteps: 1000
  # Only the action scheduler sets prediction_type; the video scheduler does
  # not specify one here.
  action_scheduler:
    train_shift: 5.0
    infer_shift: 5.0
    num_train_timesteps: 1000
    prediction_type: velocity
  # Loss coefficients; both are 1.0 here. Presumably weights on the action and
  # track loss terms — TODO confirm against the training code.
  loss:
    lambda_action: 1.0
    lambda_track: 1.0
# NOTE(review): this is the only upper-case top-level key in the file. It is
# presumably looked up by this exact name, so check the consumer before
# renaming it.
EVALUATION:
  flip_mode: vertical