PFD / config.yaml

Release PFD LIBERO 12x12 step62000 checkpoint

ba10c7d verified 4 days ago

5.56 kB

	# Run-level training settings (flat scalar keys).
	# NOTE(review): every line of this file starts with one tab and the original
	# nesting depth is not visible. YAML does not allow tabs for indentation, so
	# restore the original space indentation before loading this file.
	# NOTE(review): the run directory name contains "1e-4" while learning_rate
	# below is 6.0e-05 -- confirm which value is intended.
	output_dir: ./runs/libero_uncond_2cam224_1e-4/2026-04-22_12x12_h100x2_trainableonly_resume_from2000_nw8_pin_on
	batch_size: 32
	num_workers: 8
	pin_memory: true
	lr_scheduler_type: cosine
	learning_rate: 6.0e-05
	num_epochs: 30
	# max_steps is null; presumably the run length then comes from num_epochs
	# -- TODO confirm against the trainer.
	max_steps: null
	log_every: 10
	# save_every is 0; presumably this disables periodic step-based saving
	# -- TODO confirm.
	save_every: 0
	eval_every: 1000
	eval_num_inference_steps: 10
	# Video evaluation and video saving are both off; action metrics are on.
	eval_enable_video: false
	eval_save_video: false
	eval_enable_action_metrics: true
	pre_save_cleanup: true
	pre_save_cleanup_sleep_seconds: 5.0
	pre_save_cleanup_malloc_trim: true
	gradient_accumulation_steps: 1
	mixed_precision: bf16
	seed: 42
	max_grad_norm: 1.0
	weight_decay: 0.01
	# NOTE(review): resume is null while resume_training_state is set and points
	# at latest_training.pt inside this run's own output_dir. init_checkpoint is
	# set as well. Confirm how the trainer prioritises these three keys.
	resume: null
	init_checkpoint: ./checkpoints/fastwam_release/libero_uncond_2cam224.pt
	resume_training_state: ./runs/libero_uncond_2cam224_1e-4/2026-04-22_12x12_h100x2_trainableonly_resume_from2000_nw8_pin_on/checkpoints/latest_training.pt
	# Checkpoint options. The keys from policy through save_best_action_l2
	# presumably nest under checkpoint (indentation was lost) -- TODO confirm.
	checkpoint:
	policy: auto
	lightweight_resume_backend: trainable_only
	# Presumably keeps optimizer state out of trainable-only checkpoints;
	# verify against the code that reads this flag.
	trainable_only_include_optimizer_state: false
	# All three save_* switches are enabled.
	save_latest: true
	save_best_action_l1: true
	save_best_action_l2: true
	# wandb (presumably Weights & Biases) logging settings. enabled is false, so
	# the remaining keys presumably have no effect for this run -- TODO confirm.
	wandb:
	enabled: false
	workspace: null
	project: fast-wam
	# NOTE(review): this name says "resume_from65000" while output_dir and
	# resume_training_state say "resume_from2000" -- confirm which is stale.
	name: libero_12x12_trainableonly_resume_from65000_20260425
	group: null
	mode: online
	# Dataset configuration. _target_ names the class path
	# fastwam.datasets.lerobot.robot_video_dataset.RobotVideoDataset; the keys
	# after it are presumably its constructor arguments (Hydra-style
	# instantiation) -- TODO confirm.
	data:
	train:
	_target_: fastwam.datasets.lerobot.robot_video_dataset.RobotVideoDataset
	# Four dataset directories: libero_spatial, libero_object, libero_goal and
	# libero_10, all under ./data/libero_mujoco3.3.2.
	dataset_dirs:
	- ./data/libero_mujoco3.3.2/libero_spatial_no_noops_lerobot
	- ./data/libero_mujoco3.3.2/libero_object_no_noops_lerobot
	- ./data/libero_mujoco3.3.2/libero_goal_no_noops_lerobot
	- ./data/libero_mujoco3.3.2/libero_10_no_noops_lerobot
	# Two image entries (image, wrist_image), each raw_shape 3/512/512 and
	# shape 3/224/224; presumably (channels, height, width) -- TODO confirm.
	# One action entry of size 7 and one state entry of size 8.
	shape_meta:
	images:
	- key: image
	raw_shape:
	- 3
	- 512
	- 512
	shape:
	- 3
	- 224
	- 224
	- key: wrist_image
	raw_shape:
	- 3
	- 512
	- 512
	shape:
	- 3
	- 224
	- 224
	action:
	- key: default
	raw_shape: 7
	shape: 7
	state:
	- key: default
	raw_shape: 8
	shape: 8
	num_frames: 33
	global_sample_stride: 1
	action_video_freq_ratio: 4
	# 224 and 448: consistent with two 224x224 views joined side by side (see
	# concat_multi_camera: horizontal below); presumably (height, width)
	# -- TODO confirm.
	video_size:
	- 224
	- 448
	camera_key: null
	# No validation split is carved out of this dataset (proportion 0.0).
	val_set_proportion: 0.0
	is_training_set: true
	skip_padding_as_possible: false
	concat_multi_camera: horizontal
	# Processor configuration; _target_ is
	# fastwam.datasets.lerobot.processors.fastwam_processor.FastWAMProcessor.
	# NOTE(review): the shape_meta below repeats, entry for entry, the shape_meta
	# listed under the dataset earlier in this file; keep the two in sync.
	processor:
	_target_: fastwam.datasets.lerobot.processors.fastwam_processor.FastWAMProcessor
	shape_meta:
	images:
	- key: image
	raw_shape:
	- 3
	- 512
	- 512
	shape:
	- 3
	- 224
	- 224
	- key: wrist_image
	raw_shape:
	- 3
	- 512
	- 512
	shape:
	- 3
	- 224
	- 224
	action:
	- key: default
	raw_shape: 7
	shape: 7
	state:
	- key: default
	raw_shape: 8
	shape: 8
	# 33 equals the dataset's num_frames; 2 equals the number of image entries;
	# 7 and 8 equal the action and state sizes in shape_meta.
	num_obs_steps: 33
	num_output_cameras: 2
	action_output_dim: 7
	proprio_output_dim: 8
	# Seven flags, one per action dimension: the first six are true and the last
	# is false. Presumably the last dimension (gripper?) is not converted to a
	# delta -- TODO confirm.
	delta_action_dim_mask:
	default:
	- true
	- true
	- true
	- true
	- true
	- true
	- false
	action_state_transforms: null
	use_stepwise_action_norm: false
	norm_default_mode: min/max
	norm_exception_mode: null
	action_state_merger:
	_target_: fastwam.datasets.lerobot.transforms.action_state_merger.ConcatLeftAlign
	# train_transforms and val_transforms are identical: ToTensor followed by a
	# Resize to 224/224.
	train_transforms:
	- _target_: fastwam.datasets.lerobot.transforms.image.ToTensor
	- _target_: torchvision.transforms.Resize
	size:
	- 224
	- 224
	val_transforms:
	- _target_: fastwam.datasets.lerobot.transforms.image.ToTensor
	- _target_: torchvision.transforms.Resize
	size:
	- 224
	- 224
	# NOTE(review): with indentation lost it is unclear whether the next two keys
	# belong to processor, to the dataset, or to data -- confirm against the
	# original file. context_len (128) equals tokenizer_max_len in the model
	# section.
	text_embedding_cache_dir: ./data/text_embeds_cache/libero
	context_len: 128
	# Model construction settings; _target_ is fastwam.runtime.create_fastwam.
	model:
	_target_: fastwam.runtime.create_fastwam
	# NOTE(review): model_id refers to Wan2.2-TI2V-5B while tokenizer_model_id
	# refers to Wan2.1-T2V-1.3B; presumably intentional -- confirm.
	model_id: Wan-AI/Wan2.2-TI2V-5B
	tokenizer_model_id: Wan-AI/Wan2.1-T2V-1.3B
	tokenizer_max_len: 128
	# Presumably off because text embeddings are read from
	# text_embedding_cache_dir instead -- TODO confirm.
	load_text_encoder: false
	# 8 equals the state size declared in shape_meta.
	proprio_dim: 8
	redirect_common_files: true
	mot_checkpoint_mixed_attn: false
	# NOTE(review): this path has no leading "./", unlike init_checkpoint;
	# confirm both are resolved relative to the same working directory.
	action_dit_pretrained_path: checkpoints/ActionDiT_linear_interp_Wan22_alphascale_1024hdim.pt
	skip_dit_load_from_pretrain: false
	# Video DiT hyper-parameters.
	video_dit_config:
	has_image_input: false
	patch_size:
	- 1
	- 2
	- 2
	# in_dim and out_dim are both 48.
	in_dim: 48
	hidden_dim: 3072
	ffn_dim: 14336
	freq_dim: 256
	text_dim: 4096
	out_dim: 48
	# num_heads (24) * attn_head_dim (128) = 3072, which equals hidden_dim.
	num_heads: 24
	attn_head_dim: 128
	num_layers: 30
	eps: 1.0e-06
	# NOTE(review): "seperated" is spelled this way in the key. The key is read
	# at runtime, so do not correct the spelling here unless the consuming code
	# is changed to match.
	seperated_timestep: true
	require_clip_embedding: false
	require_vae_embedding: false
	fuse_vae_embedding_in_latents: true
	use_gradient_checkpointing: false
	# NOTE(review): with indentation lost it is unclear whether the next four
	# keys belong to video_dit_config or to model -- confirm.
	video_attention_mask_mode: first_frame_causal
	action_conditioned: false
	action_dim: 7
	action_group_causal_mask_mode: group_diagonal
	# Action DiT hyper-parameters. num_layers, text_dim, freq_dim and eps have
	# the same values as in video_dit_config.
	action_dit_config:
	action_dim: 7
	hidden_dim: 1024
	ffn_dim: 4096
	# NOTE(review): num_heads (24) * attn_head_dim (128) = 3072, which is larger
	# than this block's hidden_dim (1024) but equals video_dit_config.hidden_dim.
	# Presumably deliberate so the attention width matches the video DiT
	# -- confirm.
	num_heads: 24
	attn_head_dim: 128
	num_layers: 30
	text_dim: 4096
	freq_dim: 256
	eps: 1.0e-06
	use_gradient_checkpointing: false
	# Scheduler settings for the video and action branches; both carry identical
	# values (train_shift 5.0, infer_shift 5.0, 1000 training timesteps).
	# NOTE(review): with indentation lost it is unclear whether video_scheduler,
	# action_scheduler and loss sit under model or at the top level -- confirm.
	video_scheduler:
	train_shift: 5.0
	infer_shift: 5.0
	num_train_timesteps: 1000
	action_scheduler:
	train_shift: 5.0
	infer_shift: 5.0
	num_train_timesteps: 1000
	# Video and action terms are weighted equally (both 1.0).
	loss:
	lambda_video: 1.0
	lambda_action: 1.0
	# PFD settings; enabled is true.
	# NOTE(review): with indentation lost it is unclear whether pfd sits at the
	# top level or under model -- confirm.
	pfd:
	enabled: true
	stage: s1
	training_mode: action512_partial
	# MLP adapter: hidden_dim 512, depth 3. freq_dim (256) equals the freq_dim
	# used in both DiT configs.
	adapter:
	type: mlp
	hidden_dim: 512
	depth: 3
	freq_dim: 256
	# 12 and 12, against num_layers 30 in both DiT configs. Presumably only the
	# last 12 layers of each DiT are trainable -- TODO confirm.
	partial_unfreeze:
	action_last_layers: 12
	video_last_layers: 12
	# NOTE(review): presumably these three weights belong to pfd rather than to
	# partial_unfreeze; the lost indentation makes this ambiguous -- confirm.
	lambda_gt: 1.0
	lambda_res: 0.5
	lambda_teacher: 0.1