| output_dir: /datadrive/wjy/ckpt/fastwam_track_libero_0526 |
| batch_size: 12 |
| num_workers: 8 |
| lr_scheduler_type: cosine |
| learning_rate: 0.0001 |
| num_epochs: 10 |
| max_steps: 20000 |
| log_every: 10 |
| save_every: 2000 |
| eval_every: 200 |
| eval_num_inference_steps: 10 |
| gradient_accumulation_steps: 1 |
| mixed_precision: bf16 |
| seed: 42 |
| max_grad_norm: 1.0 |
| weight_decay: 0.01 |
| resume: null |
| wandb: |
| enabled: true |
| workspace: null |
| project: fast-wam |
| name: libero_track_2cam224_1e-4 |
| group: null |
| mode: online |
| data: |
| train: |
| _target_: fastwam.datasets.lerobot.track_robot_video_dataset.TrackRobotVideoDataset |
| dataset_dirs: |
| - /datadrive2/wjy/dataset/LIBERO_fastwam_with_marker/libero_spatial |
| - /datadrive2/wjy/dataset/LIBERO_fastwam_with_marker/libero_object |
| - /datadrive2/wjy/dataset/LIBERO_fastwam_with_marker/libero_goal |
| - /datadrive2/wjy/dataset/LIBERO_fastwam_with_marker/libero_10 |
| track_episodes_file: /datadrive2/wjy/dataset/LIBERO_fastwam_with_marker/full_whitelist.txt |
| shape_meta: |
| images: |
| - key: image |
| raw_shape: |
| - 3 |
| - 512 |
| - 512 |
| shape: |
| - 3 |
| - 224 |
| - 224 |
| - key: wrist_image |
| raw_shape: |
| - 3 |
| - 512 |
| - 512 |
| shape: |
| - 3 |
| - 224 |
| - 224 |
| action: |
| - key: default |
| raw_shape: 13 |
| shape: 13 |
| state: |
| - key: default |
| raw_shape: 8 |
| shape: 8 |
| num_frames: 33 |
| global_sample_stride: 1 |
| action_video_freq_ratio: 4 |
| video_size: |
| - 224 |
| - 448 |
| camera_key: null |
| val_set_proportion: 0.0 |
| is_training_set: true |
| skip_padding_as_possible: false |
| concat_multi_camera: horizontal |
| processor: |
| _target_: fastwam.datasets.lerobot.processors.fastwam_processor.FastWAMProcessor |
| shape_meta: |
| images: |
| - key: image |
| raw_shape: |
| - 3 |
| - 512 |
| - 512 |
| shape: |
| - 3 |
| - 224 |
| - 224 |
| - key: wrist_image |
| raw_shape: |
| - 3 |
| - 512 |
| - 512 |
| shape: |
| - 3 |
| - 224 |
| - 224 |
| action: |
| - key: default |
| raw_shape: 13 |
| shape: 13 |
| state: |
| - key: default |
| raw_shape: 8 |
| shape: 8 |
| num_obs_steps: 33 |
| num_output_cameras: 2 |
| action_output_dim: 13 |
| proprio_output_dim: 8 |
| delta_action_dim_mask: |
| default: |
| - true |
| - true |
| - true |
| - true |
| - true |
| - true |
| - false |
| - false |
| - false |
| - false |
| - false |
| - false |
| - false |
| action_state_transforms: null |
| use_stepwise_action_norm: false |
| norm_default_mode: min/max |
| norm_exception_mode: null |
| identity_dim_mask: |
| action: |
| default: |
| - false |
| - false |
| - false |
| - false |
| - false |
| - false |
| - false |
| - true |
| - true |
| - true |
| - true |
| - true |
| - true |
| action_state_merger: |
| _target_: fastwam.datasets.lerobot.transforms.action_state_merger.ConcatLeftAlign |
| train_transforms: |
| - _target_: fastwam.datasets.lerobot.transforms.image.ToTensor |
| - _target_: torchvision.transforms.Resize |
| size: |
| - 224 |
| - 224 |
| val_transforms: |
| - _target_: fastwam.datasets.lerobot.transforms.image.ToTensor |
| - _target_: torchvision.transforms.Resize |
| size: |
| - 224 |
| - 224 |
| text_embedding_cache_dir: ./data/text_embeds_cache/libero |
| context_len: 128 |
| model: |
| _target_: fastwam.runtime.create_fastwam_track |
| model_id: Wan-AI/Wan2.2-TI2V-5B |
| tokenizer_model_id: Wan-AI/Wan2.1-T2V-1.3B |
| tokenizer_max_len: 128 |
| load_text_encoder: false |
| proprio_dim: 8 |
| redirect_common_files: true |
| mot_checkpoint_mixed_attn: false |
| action_dit_pretrained_path: checkpoints/ActionDiT_linear_interp_Wan22_alphascale_1024hdim.pt |
| skip_dit_load_from_pretrain: false |
| video_dit_config: |
| has_image_input: false |
| patch_size: |
| - 1 |
| - 2 |
| - 2 |
| in_dim: 48 |
| hidden_dim: 3072 |
| ffn_dim: 14336 |
| freq_dim: 256 |
| text_dim: 4096 |
| out_dim: 48 |
| num_heads: 24 |
| attn_head_dim: 128 |
| num_layers: 30 |
| eps: 1.0e-06 |
| seperated_timestep: true |
| require_clip_embedding: false |
| require_vae_embedding: false |
| fuse_vae_embedding_in_latents: true |
| use_gradient_checkpointing: false |
| video_attention_mask_mode: first_frame_causal |
| action_conditioned: false |
| action_dim: 13 |
| action_group_causal_mask_mode: group_diagonal |
| action_dit_config: |
| action_dim: 13 |
| hidden_dim: 1024 |
| ffn_dim: 4096 |
| num_heads: 24 |
| attn_head_dim: 128 |
| num_layers: 30 |
| text_dim: 4096 |
| freq_dim: 256 |
| eps: 1.0e-06 |
| use_gradient_checkpointing: false |
| video_scheduler: |
| train_shift: 5.0 |
| infer_shift: 5.0 |
| num_train_timesteps: 1000 |
| action_scheduler: |
| train_shift: 5.0 |
| infer_shift: 5.0 |
| num_train_timesteps: 1000 |
| prediction_type: velocity |
| loss: |
| lambda_action: 1.0 |
| lambda_track: 1.0 |
| EVALUATION: |
| flip_mode: vertical |
|
|