# MyFastWAM / config.yaml
# emiliiia's picture
# Add files using upload-large-folder tool
# c37e0dd verified
# Top-level run settings. Grouping below is for readability only; all keys
# stay at the top level of the document.

# Output location and data-loader sizing.
output_dir: '/datadrive/wjy/ckpt/fastwam_track_libero_0526'
batch_size: 12
num_workers: 8

# Learning-rate schedule.
lr_scheduler_type: 'cosine'
learning_rate: 1.0e-4

# Training length. NOTE(review): both an epoch count and a step cap are set;
# which one ends training first is decided by the consumer — confirm.
num_epochs: 10
max_steps: 20000

# Periodic actions (units are whatever the trainer counts — presumably
# optimizer steps; confirm).
log_every: 10
save_every: 2000
eval_every: 200
eval_num_inference_steps: 10

# Numerics, reproducibility and regularisation.
gradient_accumulation_steps: 1
mixed_precision: 'bf16'
seed: 42
max_grad_norm: 1.0
weight_decay: 0.01

# No checkpoint to resume from.
resume: null
# Experiment-tracking settings (the key name suggests Weights & Biases).
# The child keys are indented so they load as one nested mapping under
# `wandb` instead of as separate top-level keys next to a null `wandb`.
wandb:
  enabled: true
  # Left unset. NOTE(review): the fallback workspace is chosen by the
  # consumer — not visible in this file.
  workspace: null
  project: fast-wam
  name: libero_track_2cam224_1e-4
  group: null
  mode: online
# Dataset section: the training dataset class and where its data lives.
# NOTE(review): every line in this section starts at column 0, so `train` and
# the keys after it are not nested under `data` as written. Confirm the
# intended indentation against the original file before loading.
data:
train:
# Dotted path to the dataset class; looks like a Hydra-style `_target_`
# (confirm against the loader).
_target_: fastwam.datasets.lerobot.track_robot_video_dataset.TrackRobotVideoDataset
# Four LIBERO suites, all under one absolute, machine-specific root.
dataset_dirs:
- /datadrive2/wjy/dataset/LIBERO_fastwam_with_marker/libero_spatial
- /datadrive2/wjy/dataset/LIBERO_fastwam_with_marker/libero_object
- /datadrive2/wjy/dataset/LIBERO_fastwam_with_marker/libero_goal
- /datadrive2/wjy/dataset/LIBERO_fastwam_with_marker/libero_10
# Presumably a whitelist of episodes to use, judging by the file name — confirm.
track_episodes_file: /datadrive2/wjy/dataset/LIBERO_fastwam_with_marker/full_whitelist.txt
# Shape description of the dataset's modalities: two image streams (`image`,
# `wrist_image`), one action entry and one state entry.
# NOTE(review): presumably `raw_shape` is the stored size and `shape` the size
# handed to the model, both channel-first — confirm.
shape_meta:
images:
# First camera stream: 3x512x512 raw, 3x224x224 target.
- key: image
raw_shape:
- 3
- 512
- 512
shape:
- 3
- 224
- 224
# Second camera stream, same sizes as the first.
- key: wrist_image
raw_shape:
- 3
- 512
- 512
shape:
- 3
- 224
- 224
# Action vector: 13 values, unchanged between raw and target.
action:
- key: default
raw_shape: 13
shape: 13
# State vector: 8 values, unchanged between raw and target.
state:
- key: default
raw_shape: 8
shape: 8
# Clip sampling and frame layout.
# NOTE(review): presumably 33 frames per sample, with actions at 4x the video
# frame rate — confirm against the dataset class.
num_frames: 33
global_sample_stride: 1
action_video_freq_ratio: 4
# 224 x 448, where 448 = 2 x 224. Presumably the two 224-wide camera views
# placed side by side, given `concat_multi_camera: horizontal` below — confirm.
video_size:
- 224
- 448
camera_key: null
# No samples are held out for validation.
val_set_proportion: 0.0
is_training_set: true
skip_padding_as_possible: false
concat_multi_camera: horizontal
# Sample processor: its class, its own copy of the modality shapes, and the
# output dimensions.
# NOTE(review): `shape_meta` and its children appear a second time here at
# column 0, so they repeat keys from the dataset section as written. The
# original indentation presumably nested them under `processor` — confirm.
processor:
_target_: fastwam.datasets.lerobot.processors.fastwam_processor.FastWAMProcessor
# Same values as the dataset-level `shape_meta` earlier in this file.
shape_meta:
images:
- key: image
raw_shape:
- 3
- 512
- 512
shape:
- 3
- 224
- 224
- key: wrist_image
raw_shape:
- 3
- 512
- 512
shape:
- 3
- 224
- 224
action:
- key: default
raw_shape: 13
shape: 13
state:
- key: default
raw_shape: 8
shape: 8
# Equal to `num_frames` (33) in the sampling settings above.
num_obs_steps: 33
# Equal to the number of image entries in `shape_meta` (2).
num_output_cameras: 2
# Equal to the action (13) and state (8) sizes in `shape_meta`.
action_output_dim: 13
proprio_output_dim: 8
# Per-dimension action masks and normalization settings. Both masks have 13
# entries, matching the action size of 13.
# First 6 entries true, remaining 7 false.
# NOTE(review): presumably marks the dims treated as delta actions — confirm.
delta_action_dim_mask:
default:
- true
- true
- true
- true
- true
- true
- false
- false
- false
- false
- false
- false
- false
action_state_transforms: null
use_stepwise_action_norm: false
norm_default_mode: min/max
norm_exception_mode: null
# First 7 entries false, last 6 true.
# NOTE(review): presumably the flagged dims are passed through without
# normalization — confirm against the processor implementation.
identity_dim_mask:
action:
default:
- false
- false
- false
- false
- false
- false
- false
- true
- true
- true
- true
- true
- true
# Action/state merger, image transform pipelines, and text-embedding cache.
action_state_merger:
_target_: fastwam.datasets.lerobot.transforms.action_state_merger.ConcatLeftAlign
# Training pipeline: ToTensor, then Resize to 224 x 224.
train_transforms:
- _target_: fastwam.datasets.lerobot.transforms.image.ToTensor
- _target_: torchvision.transforms.Resize
size:
- 224
- 224
# Validation pipeline: identical to the training pipeline above.
val_transforms:
- _target_: fastwam.datasets.lerobot.transforms.image.ToTensor
- _target_: torchvision.transforms.Resize
size:
- 224
- 224
# Relative path. NOTE(review): presumably resolved against the working
# directory. Whether the next two keys belong to the processor or to the
# dataset cannot be told from this flattened view — confirm.
text_embedding_cache_dir: ./data/text_embeds_cache/libero
# Same value as `tokenizer_max_len` (128) in the model section.
context_len: 128
# Model section: factory target, backbone/tokenizer identifiers and loading
# switches.
# NOTE(review): every line in this section starts at column 0, so the keys
# below are not nested under `model` as written — confirm the intended
# indentation against the original file.
model:
_target_: fastwam.runtime.create_fastwam_track
model_id: Wan-AI/Wan2.2-TI2V-5B
tokenizer_model_id: Wan-AI/Wan2.1-T2V-1.3B
tokenizer_max_len: 128
# NOTE(review): presumably off because text embeddings come from
# `text_embedding_cache_dir` — confirm.
load_text_encoder: false
# Same value as the state size (8) in `shape_meta`.
proprio_dim: 8
redirect_common_files: true
mot_checkpoint_mixed_attn: false
# Relative path to a pretrained checkpoint file.
action_dit_pretrained_path: checkpoints/ActionDiT_linear_interp_Wan22_alphascale_1024hdim.pt
skip_dit_load_from_pretrain: false
# Video DiT architecture settings.
# NOTE(review): with indentation absent, it is not visible where this
# sub-mapping ends; the keys from `video_attention_mask_mode` onward may
# belong here or one level up — confirm.
video_dit_config:
has_image_input: false
patch_size:
- 1
- 2
- 2
in_dim: 48
# 3072 = num_heads (24) x attn_head_dim (128).
hidden_dim: 3072
ffn_dim: 14336
freq_dim: 256
text_dim: 4096
out_dim: 48
num_heads: 24
attn_head_dim: 128
num_layers: 30
eps: 1.0e-06
# NOTE(review): the spelling "seperated" must match the name the consumer
# expects — do not rename without checking.
seperated_timestep: true
require_clip_embedding: false
require_vae_embedding: false
fuse_vae_embedding_in_latents: true
use_gradient_checkpointing: false
video_attention_mask_mode: first_frame_causal
action_conditioned: false
# Same value as the action size (13) in `shape_meta`.
action_dim: 13
action_group_causal_mask_mode: group_diagonal
# Action DiT architecture settings. Layer count, head count, head size,
# text_dim, freq_dim and eps carry the same values as the video DiT settings.
action_dit_config:
action_dim: 13
# NOTE(review): hidden_dim is 1024 while num_heads x attn_head_dim is
# 24 x 128 = 3072. Presumably the attention width is meant to match the
# video DiT rather than hidden_dim — confirm this is intended.
hidden_dim: 1024
ffn_dim: 4096
num_heads: 24
attn_head_dim: 128
num_layers: 30
text_dim: 4096
freq_dim: 256
eps: 1.0e-06
use_gradient_checkpointing: false
# Scheduler settings for the video and action branches. Both carry identical
# shift and timestep values.
video_scheduler:
train_shift: 5.0
infer_shift: 5.0
num_train_timesteps: 1000
action_scheduler:
train_shift: 5.0
infer_shift: 5.0
num_train_timesteps: 1000
# NOTE(review): with indentation absent, it cannot be told whether this key
# belongs to `action_scheduler` or to an enclosing mapping — confirm.
prediction_type: velocity
# Loss coefficients; both are 1.0.
# NOTE(review): presumably weights on the action and track loss terms —
# confirm how the consumer combines them.
loss:
lambda_action: 1.0
lambda_track: 1.0
# Evaluation-time options. `flip_mode` is indented so it loads as a child of
# `EVALUATION` instead of as a separate top-level key next to a null
# `EVALUATION`.
# NOTE(review): what gets flipped (frames, camera views, ...) is not visible
# in this file — confirm against the evaluation code.
EVALUATION:
  flip_mode: vertical