| output_dir: ./runs/libero_uncond_2cam224_1e-4/2026-04-22_12x12_h100x2_trainableonly_resume_from2000_nw8_pin_on |
| batch_size: 32 |
| num_workers: 8 |
| pin_memory: true |
| lr_scheduler_type: cosine |
| learning_rate: 6.0e-05 |
| num_epochs: 30 |
| max_steps: null |
| log_every: 10 |
| save_every: 0 |
| eval_every: 1000 |
| eval_num_inference_steps: 10 |
| eval_enable_video: false |
| eval_save_video: false |
| eval_enable_action_metrics: true |
| pre_save_cleanup: true |
| pre_save_cleanup_sleep_seconds: 5.0 |
| pre_save_cleanup_malloc_trim: true |
| gradient_accumulation_steps: 1 |
| mixed_precision: bf16 |
| seed: 42 |
| max_grad_norm: 1.0 |
| weight_decay: 0.01 |
| resume: null |
| init_checkpoint: ./checkpoints/fastwam_release/libero_uncond_2cam224.pt |
| resume_training_state: ./runs/libero_uncond_2cam224_1e-4/2026-04-22_12x12_h100x2_trainableonly_resume_from2000_nw8_pin_on/checkpoints/latest_training.pt |
| checkpoint: |
| policy: auto |
| lightweight_resume_backend: trainable_only |
| trainable_only_include_optimizer_state: false |
| save_latest: true |
| save_best_action_l1: true |
| save_best_action_l2: true |
| wandb: |
| enabled: false |
| workspace: null |
| project: fast-wam |
| name: libero_12x12_trainableonly_resume_from65000_20260425 |
| group: null |
| mode: online |
| data: |
| train: |
| _target_: fastwam.datasets.lerobot.robot_video_dataset.RobotVideoDataset |
| dataset_dirs: |
| - ./data/libero_mujoco3.3.2/libero_spatial_no_noops_lerobot |
| - ./data/libero_mujoco3.3.2/libero_object_no_noops_lerobot |
| - ./data/libero_mujoco3.3.2/libero_goal_no_noops_lerobot |
| - ./data/libero_mujoco3.3.2/libero_10_no_noops_lerobot |
| shape_meta: |
| images: |
| - key: image |
| raw_shape: |
| - 3 |
| - 512 |
| - 512 |
| shape: |
| - 3 |
| - 224 |
| - 224 |
| - key: wrist_image |
| raw_shape: |
| - 3 |
| - 512 |
| - 512 |
| shape: |
| - 3 |
| - 224 |
| - 224 |
| action: |
| - key: default |
| raw_shape: 7 |
| shape: 7 |
| state: |
| - key: default |
| raw_shape: 8 |
| shape: 8 |
| num_frames: 33 |
| global_sample_stride: 1 |
| action_video_freq_ratio: 4 |
| video_size: |
| - 224 |
| - 448 |
| camera_key: null |
| val_set_proportion: 0.0 |
| is_training_set: true |
| skip_padding_as_possible: false |
| concat_multi_camera: horizontal |
| processor: |
| _target_: fastwam.datasets.lerobot.processors.fastwam_processor.FastWAMProcessor |
| shape_meta: |
| images: |
| - key: image |
| raw_shape: |
| - 3 |
| - 512 |
| - 512 |
| shape: |
| - 3 |
| - 224 |
| - 224 |
| - key: wrist_image |
| raw_shape: |
| - 3 |
| - 512 |
| - 512 |
| shape: |
| - 3 |
| - 224 |
| - 224 |
| action: |
| - key: default |
| raw_shape: 7 |
| shape: 7 |
| state: |
| - key: default |
| raw_shape: 8 |
| shape: 8 |
| num_obs_steps: 33 |
| num_output_cameras: 2 |
| action_output_dim: 7 |
| proprio_output_dim: 8 |
| delta_action_dim_mask: |
| default: |
| - true |
| - true |
| - true |
| - true |
| - true |
| - true |
| - false |
| action_state_transforms: null |
| use_stepwise_action_norm: false |
| norm_default_mode: min/max |
| norm_exception_mode: null |
| action_state_merger: |
| _target_: fastwam.datasets.lerobot.transforms.action_state_merger.ConcatLeftAlign |
| train_transforms: |
| - _target_: fastwam.datasets.lerobot.transforms.image.ToTensor |
| - _target_: torchvision.transforms.Resize |
| size: |
| - 224 |
| - 224 |
| val_transforms: |
| - _target_: fastwam.datasets.lerobot.transforms.image.ToTensor |
| - _target_: torchvision.transforms.Resize |
| size: |
| - 224 |
| - 224 |
| text_embedding_cache_dir: ./data/text_embeds_cache/libero |
| context_len: 128 |
| model: |
| _target_: fastwam.runtime.create_fastwam |
| model_id: Wan-AI/Wan2.2-TI2V-5B |
| tokenizer_model_id: Wan-AI/Wan2.1-T2V-1.3B |
| tokenizer_max_len: 128 |
| load_text_encoder: false |
| proprio_dim: 8 |
| redirect_common_files: true |
| mot_checkpoint_mixed_attn: false |
| action_dit_pretrained_path: checkpoints/ActionDiT_linear_interp_Wan22_alphascale_1024hdim.pt |
| skip_dit_load_from_pretrain: false |
| video_dit_config: |
| has_image_input: false |
| patch_size: |
| - 1 |
| - 2 |
| - 2 |
| in_dim: 48 |
| hidden_dim: 3072 |
| ffn_dim: 14336 |
| freq_dim: 256 |
| text_dim: 4096 |
| out_dim: 48 |
| num_heads: 24 |
| attn_head_dim: 128 |
| num_layers: 30 |
| eps: 1.0e-06 |
| seperated_timestep: true |
| require_clip_embedding: false |
| require_vae_embedding: false |
| fuse_vae_embedding_in_latents: true |
| use_gradient_checkpointing: false |
| video_attention_mask_mode: first_frame_causal |
| action_conditioned: false |
| action_dim: 7 |
| action_group_causal_mask_mode: group_diagonal |
| action_dit_config: |
| action_dim: 7 |
| hidden_dim: 1024 |
| ffn_dim: 4096 |
| num_heads: 24 |
| attn_head_dim: 128 |
| num_layers: 30 |
| text_dim: 4096 |
| freq_dim: 256 |
| eps: 1.0e-06 |
| use_gradient_checkpointing: false |
| video_scheduler: |
| train_shift: 5.0 |
| infer_shift: 5.0 |
| num_train_timesteps: 1000 |
| action_scheduler: |
| train_shift: 5.0 |
| infer_shift: 5.0 |
| num_train_timesteps: 1000 |
| loss: |
| lambda_video: 1.0 |
| lambda_action: 1.0 |
| pfd: |
| enabled: true |
| stage: s1 |
| training_mode: action512_partial |
| adapter: |
| type: mlp |
| hidden_dim: 512 |
| depth: 3 |
| freq_dim: 256 |
| partial_unfreeze: |
| action_last_layers: 12 |
| video_last_layers: 12 |
| lambda_gt: 1.0 |
| lambda_res: 0.5 |
| lambda_teacher: 0.1 |
|
|