# @package _global_

# REFERENCE COPY — canonical is in this workstream's local lerobot clone at:
#   ./lerobot/lerobot/configs/policy/act_diffusion_aloha_solo_real.yaml
#
# Single-arm (LEFT) ALOHA — Hybrid ACT+Diffusion policy.
# ACT encoder (ResNet18 + transformer) → DDIM diffusion U-Net → action chunks.
# 2 cameras: cam_left_wrist + cam_high. state_dim=action_dim=9.
# DOE winner: batch=24, lr=3e-5 (2026-04-20)
#
# NOTE(review): this file had been collapsed onto two physical lines, which
# turned nearly every key into comment text. The nesting below was restored
# from key order and the usual lerobot config layout — diff against the
# canonical copy to confirm.

seed: 1000
dataset_repo_id: JHeisler/aloha_solo_left_4_6_26

# Per-channel image statistics, shaped (c, 1, 1). These are the standard
# ImageNet values, which presumably pair with the IMAGENET1K_V1 backbone
# weights selected under `policy`.
override_dataset_stats:
  observation.images.cam_left_wrist:
    mean: [[[0.485]], [[0.456]], [[0.406]]]
    std: [[[0.229]], [[0.224]], [[0.225]]]
  observation.images.cam_high:
    mean: [[[0.485]], [[0.456]], [[0.406]]]
    std: [[[0.229]], [[0.224]], [[0.225]]]

# NOTE(review): placed at top level because they appear between the dataset
# stats and `training:` in the original key order — confirm.
use_amp: true
use_torch_compile: true

training:
  offline_steps: 40000
  online_steps: 0
  eval_freq: -1
  save_freq: 10000
  log_freq: 100
  save_checkpoint: true

  # NOTE(review): the header records the DOE winner as batch=24, lr=3e-5, but
  # the values here are 28 / 3.5e-5. Confirm which is current and update
  # either the header or these values.
  batch_size: 28
  lr: 3.5e-5
  lr_backbone: 3.5e-5
  lr_warmup_steps: 500
  drop_n_last_frames: 2
  # Written with an explicit mantissa dot so YAML 1.1 loaders that require
  # one (e.g. plain PyYAML) read a float rather than a string.
  weight_decay: 1.0e-4
  grad_clip_norm: 10
  online_steps_between_rollouts: 1

  # One action timestamp per chunk element, spaced 1/fps apart.
  delta_timestamps:
    action: "[i / ${fps} for i in range(${policy.chunk_size})]"

eval:
  n_episodes: 50
  batch_size: 50

policy:
  name: hybrid_act_diffusion

  # Input / output structure.
  n_obs_steps: 1
  chunk_size: 100
  n_action_steps: 100

  input_shapes:
    observation.images.cam_left_wrist: [3, 480, 640]
    observation.images.cam_high: [3, 480, 640]
    observation.state: ["${env.state_dim}"]
  output_shapes:
    action: ["${env.action_dim}"]

  # Normalization / unnormalization.
  input_normalization_modes:
    observation.images.cam_left_wrist: mean_std
    observation.images.cam_high: mean_std
    observation.state: mean_std
  output_normalization_modes:
    action: mean_std

  # ACT visual encoder
  vision_backbone: resnet18
  pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
  replace_final_stride_with_dilation: false
  pre_norm: false
  dim_model: 512
  n_heads: 8
  dim_feedforward: 3200
  feedforward_activation: relu
  n_encoder_layers: 4
  dropout: 0.1

  # Diffusion U-Net
  down_dims: [256, 512]
  kernel_size: 5
  n_groups: 8
  diffusion_step_embed_dim: 128
  use_film_scale_modulation: true

  # Noise scheduler
  # NOTE(review): the header describes a DDIM diffusion head, but the
  # scheduler type here is DDPM while num_inference_steps (10) is well below
  # num_train_timesteps (100). Confirm whether DDIM was intended.
  noise_scheduler_type: DDPM
  num_train_timesteps: 100
  beta_schedule: squaredcos_cap_v2
  beta_start: 0.0001
  beta_end: 0.02
  prediction_type: epsilon
  clip_sample: true
  clip_sample_range: 1.0
  num_inference_steps: 10

  do_mask_loss_for_padding: true