# @package _global_
# REFERENCE COPY — canonical is in this workstream's local lerobot clone at:
# ./lerobot/lerobot/configs/policy/act_diffusion_aloha_solo_real.yaml
# Single-arm (LEFT) ALOHA — Hybrid ACT+Diffusion policy.
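# ACT encoder (ResNet18 + transformer) → diffusion U-Net → action chunks.
# NOTE(review): this header previously said "DDIM", but
# policy.noise_scheduler_type below is DDPM — confirm which sampler is intended.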
# 2 cameras: cam_left_wrist + cam_high. state_dim=action_dim=9.
# DOE winner: batch=24, lr=3e-5 (2026-04-20)
# Seed value for the run.
seed: 1000
# Dataset repository identifier, in <owner>/<name> form.
dataset_repo_id: JHeisler/aloha_solo_left_4_6_26
# Per-camera normalization statistics that replace the dataset's own values.
# Each mean/std is a nested list with one entry per channel (3 x 1 x 1).
# The values are the standard ImageNet channel statistics.
override_dataset_stats:
  observation.images.cam_left_wrist:
    mean: [[[0.485]], [[0.456]], [[0.406]]]
    std: [[[0.229]], [[0.224]], [[0.225]]]
  observation.images.cam_high:
    mean: [[[0.485]], [[0.456]], [[0.406]]]
    std: [[[0.229]], [[0.224]], [[0.225]]]
# NOTE(review): presumably toggles automatic mixed precision — confirm
# against the training script.
use_amp: true
# NOTE(review): presumably wraps the policy with torch.compile — confirm
# against the training script.
use_torch_compile: true
# Training-loop settings: step counts, logging/checkpoint cadence, optimizer
# hyperparameters, and the action timestamps requested from the dataset.
training:
  offline_steps: 13400
  online_steps: 0
  # NOTE(review): -1 presumably disables periodic evaluation — confirm.
  eval_freq: -1
  save_freq: 5000
  log_freq: 100
  save_checkpoint: true

  batch_size: 24
  # Exponent floats are written with an explicit ".0" mantissa so that
  # YAML 1.1 loaders (e.g. PyYAML) resolve them as floats rather than strings.
  lr: 3.0e-5
  lr_backbone: 3.0e-5
  lr_warmup_steps: 500
  drop_n_last_frames: 2
  weight_decay: 1.0e-4
  grad_clip_norm: 10
  online_steps_between_rollouts: 1

  # One action timestamp per chunk step: i / fps for i in [0, policy.chunk_size).
  delta_timestamps:
    action: "[i / ${fps} for i in range(${policy.chunk_size})]"
# Evaluation settings: number of episodes and evaluation batch size.
eval:
  n_episodes: 50
  batch_size: 50
# Policy definition: observation/action interface, ACT visual encoder,
# diffusion U-Net, noise scheduler, and loss options.
policy:
  name: hybrid_act_diffusion

  # Observation window and action chunking. n_action_steps equals chunk_size
  # in this config.
  n_obs_steps: 1
  chunk_size: 100
  n_action_steps: 100

  # Image inputs are [3, 480, 640]; state and action sizes are interpolated
  # from the env config.
  input_shapes:
    observation.images.cam_left_wrist: [3, 480, 640]
    observation.images.cam_high: [3, 480, 640]
    observation.state: ["${env.state_dim}"]
  output_shapes:
    action: ["${env.action_dim}"]

  input_normalization_modes:
    observation.images.cam_left_wrist: mean_std
    observation.images.cam_high: mean_std
    observation.state: mean_std
  output_normalization_modes:
    action: mean_std

  # ACT visual encoder
  vision_backbone: resnet18
  pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
  replace_final_stride_with_dilation: false
  pre_norm: false
  dim_model: 512
  n_heads: 8
  dim_feedforward: 3200
  feedforward_activation: relu
  n_encoder_layers: 4
  dropout: 0.1

  # Diffusion U-Net
  down_dims: [256, 512]
  kernel_size: 5
  n_groups: 8
  diffusion_step_embed_dim: 128
  use_film_scale_modulation: true

  # Noise scheduler
  # NOTE(review): DDPM is configured with num_inference_steps (10) well below
  # num_train_timesteps (100) — confirm the policy supports strided DDPM
  # sampling, or whether a DDIM scheduler was intended.
  noise_scheduler_type: DDPM
  num_train_timesteps: 100
  beta_schedule: squaredcos_cap_v2
  beta_start: 0.0001
  beta_end: 0.02
  prediction_type: epsilon
  # NOTE(review): actions use mean_std normalization (see
  # output_normalization_modes above), which is not bounded to [-1, 1] —
  # confirm that clipping samples to +/- clip_sample_range is intended.
  clip_sample: true
  clip_sample_range: 1.0
  num_inference_steps: 10

  do_mask_loss_for_padding: true