---
# Training configuration for a FastWAM "track" model on the LIBERO datasets.
#
# `_target_` entries name the Python callable/class to build for that node
# (Hydra-style instantiation — presumably via hydra.utils.instantiate; confirm).
#
# NOTE(review): the block structure of this file was restored from a copy in
# which all line breaks and indentation had been collapsed. Nesting was
# inferred from key names and key order; placements marked "confirm" below
# should be checked against the consuming code.

# --- Run / optimisation settings ---------------------------------------------
# Absolute, machine-specific path.
output_dir: /datadrive/wjy/ckpt/fastwam_track_libero_0526
batch_size: 12
num_workers: 8
lr_scheduler_type: cosine
learning_rate: 0.0001
# Both an epoch count and a step cap are set; which one ends training is
# decided by the trainer and is not visible here — TODO confirm.
num_epochs: 10
max_steps: 20000
# Intervals are presumably counted in optimiser steps — confirm.
log_every: 10
save_every: 2000
# NOTE(review): data.train.val_set_proportion is 0.0 while eval_every is 200 —
# confirm what data the periodic evaluation runs on.
eval_every: 200
eval_num_inference_steps: 10
gradient_accumulation_steps: 1
mixed_precision: bf16
seed: 42
max_grad_norm: 1.0
weight_decay: 0.01
resume: null

# --- Experiment tracking -----------------------------------------------------
wandb:
  enabled: true
  workspace: null
  project: fast-wam
  # Plain string; the "1e-4" suffix presumably mirrors learning_rate (0.0001).
  name: libero_track_2cam224_1e-4
  group: null
  mode: online

# --- Dataset -----------------------------------------------------------------
data:
  train:
    _target_: fastwam.datasets.lerobot.track_robot_video_dataset.TrackRobotVideoDataset
    # Absolute, machine-specific paths.
    dataset_dirs:
      - /datadrive2/wjy/dataset/LIBERO_fastwam_with_marker/libero_spatial
      - /datadrive2/wjy/dataset/LIBERO_fastwam_with_marker/libero_object
      - /datadrive2/wjy/dataset/LIBERO_fastwam_with_marker/libero_goal
      - /datadrive2/wjy/dataset/LIBERO_fastwam_with_marker/libero_10
    track_episodes_file: /datadrive2/wjy/dataset/LIBERO_fastwam_with_marker/full_whitelist.txt
    # Duplicated verbatim under processor.shape_meta below; presumably the two
    # copies must stay in sync — confirm.
    shape_meta:
      images:
        - key: image
          raw_shape: [3, 512, 512]
          shape: [3, 224, 224]
        - key: wrist_image
          raw_shape: [3, 512, 512]
          shape: [3, 224, 224]
      action:
        - key: default
          raw_shape: 13
          shape: 13
      state:
        - key: default
          raw_shape: 8
          shape: 8
    # Same value as processor.num_obs_steps below.
    num_frames: 33
    global_sample_stride: 1
    action_video_freq_ratio: 4
    # Two 224x224 views with concat_multi_camera: horizontal give 224 + 224 =
    # 448 on one axis; presumably the order is (height, width) — confirm.
    video_size: [224, 448]
    camera_key: null
    val_set_proportion: 0.0
    is_training_set: true
    skip_padding_as_possible: false
    concat_multi_camera: horizontal
    processor:
      _target_: fastwam.datasets.lerobot.processors.fastwam_processor.FastWAMProcessor
      # Same content as data.train.shape_meta above.
      shape_meta:
        images:
          - key: image
            raw_shape: [3, 512, 512]
            shape: [3, 224, 224]
          - key: wrist_image
            raw_shape: [3, 512, 512]
            shape: [3, 224, 224]
        action:
          - key: default
            raw_shape: 13
            shape: 13
        state:
          - key: default
            raw_shape: 8
            shape: 8
      num_obs_steps: 33
      # Matches the two image keys listed in shape_meta.images.
      num_output_cameras: 2
      action_output_dim: 13
      proprio_output_dim: 8
      # 13 entries, matching action_output_dim.
      delta_action_dim_mask:
        default:
          # indices 0-5
          - true
          - true
          - true
          - true
          - true
          - true
          # indices 6-12
          - false
          - false
          - false
          - false
          - false
          - false
          - false
      action_state_transforms: null
      use_stepwise_action_norm: false
      norm_default_mode: min/max
      norm_exception_mode: null
      # 13 entries, matching action_output_dim. Index 6 is false both here and
      # in delta_action_dim_mask.
      identity_dim_mask:
        action:
          default:
            # indices 0-6
            - false
            - false
            - false
            - false
            - false
            - false
            - false
            # indices 7-12
            - true
            - true
            - true
            - true
            - true
            - true
      action_state_merger:
        _target_: fastwam.datasets.lerobot.transforms.action_state_merger.ConcatLeftAlign
      # Train and validation image pipelines are currently identical.
      train_transforms:
        - _target_: fastwam.datasets.lerobot.transforms.image.ToTensor
        - _target_: torchvision.transforms.Resize
          size: [224, 224]
      val_transforms:
        - _target_: fastwam.datasets.lerobot.transforms.image.ToTensor
        - _target_: torchvision.transforms.Resize
          size: [224, 224]
    # NOTE(review): the next two keys are placed on the dataset (data.train)
    # rather than on the processor; inferred from key order — confirm.
    # Relative path; resolved against the process working directory.
    text_embedding_cache_dir: ./data/text_embeds_cache/libero
    # Same value as model.tokenizer_max_len.
    context_len: 128

# --- Model -------------------------------------------------------------------
model:
  _target_: fastwam.runtime.create_fastwam_track
  model_id: Wan-AI/Wan2.2-TI2V-5B
  tokenizer_model_id: Wan-AI/Wan2.1-T2V-1.3B
  tokenizer_max_len: 128
  load_text_encoder: false
  # Same value as the state shape and proprio_output_dim in the data section.
  proprio_dim: 8
  redirect_common_files: true
  mot_checkpoint_mixed_attn: false
  # Relative path; resolved against the process working directory.
  action_dit_pretrained_path: checkpoints/ActionDiT_linear_interp_Wan22_alphascale_1024hdim.pt
  skip_dit_load_from_pretrain: false
  video_dit_config:
    has_image_input: false
    patch_size: [1, 2, 2]
    in_dim: 48
    # num_heads * attn_head_dim = 24 * 128 = 3072 = hidden_dim.
    hidden_dim: 3072
    ffn_dim: 14336
    freq_dim: 256
    text_dim: 4096
    out_dim: 48
    num_heads: 24
    attn_head_dim: 128
    num_layers: 30
    eps: 1.0e-06
    # Key spelling ("seperated") is left untouched: it is presumably the name
    # the model code reads — do not rename without checking the consumer.
    seperated_timestep: true
    require_clip_embedding: false
    require_vae_embedding: false
    fuse_vae_embedding_in_latents: true
    use_gradient_checkpointing: false
    video_attention_mask_mode: first_frame_causal
    action_conditioned: false
    action_dim: 13
    action_group_causal_mask_mode: group_diagonal
  action_dit_config:
    action_dim: 13
    # num_heads * attn_head_dim = 3072 here, which equals
    # video_dit_config.hidden_dim rather than this hidden_dim (1024);
    # presumably intentional — confirm. num_layers matches the video DiT (30).
    hidden_dim: 1024
    ffn_dim: 4096
    num_heads: 24
    attn_head_dim: 128
    num_layers: 30
    text_dim: 4096
    freq_dim: 256
    eps: 1.0e-06
    use_gradient_checkpointing: false
  video_scheduler:
    train_shift: 5.0
    infer_shift: 5.0
    num_train_timesteps: 1000
  action_scheduler:
    train_shift: 5.0
    infer_shift: 5.0
    num_train_timesteps: 1000
  # NOTE(review): prediction_type and loss are placed directly under `model`;
  # inferred from key order (prediction_type could instead belong to
  # action_scheduler) — confirm against create_fastwam_track.
  prediction_type: velocity
  loss:
    lambda_action: 1.0
    lambda_track: 1.0

# --- Evaluation --------------------------------------------------------------
# NOTE(review): placed at top level; inferred from key order — confirm.
# The upper-case key differs from the snake_case used elsewhere but is kept
# as-is because consumers look it up by this exact name.
EVALUATION:
  flip_mode: vertical