# @package _global_
# Source: aloha_solo_left_act_diffusion_40k / training_config_source.yaml
# Commit f2eaf0a (JHeisler): "Add Hydra source training config"
# REFERENCE COPY — canonical is in this workstream's local lerobot clone at:
# ./lerobot/lerobot/configs/policy/act_diffusion_aloha_solo_real.yaml
# Single-arm (LEFT) ALOHA — Hybrid ACT+Diffusion policy.
# ACT encoder (ResNet18 + transformer) → diffusion U-Net → action chunks.
# NOTE: originally described as DDIM, but noise_scheduler_type below is DDPM — confirm the sampler.
# 2 cameras: cam_left_wrist + cam_high. state_dim=action_dim=9.
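# DOE winner: batch=24, lr=3e-5 (2026-04-20)
# NOTE: the training section below uses batch_size=28 and lr=3.5e-5 — confirm which values are intended.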
# Random seed for the run.
seed: 1000
# Dataset repository id, in owner/name form.
dataset_repo_id: 'JHeisler/aloha_solo_left_4_6_26'
# Per-camera normalization statistics that replace the ones stored with the
# dataset. Each camera gets its own mean/std pair, so the entries must be
# nested under the camera key (flat, they collapse into duplicate keys).
override_dataset_stats:
  observation.images.cam_left_wrist:
    # Standard ImageNet channel statistics, one value per channel, each
    # wrapped as [[x]] so the overall shape is (3, 1, 1).
    mean: [[[0.485]], [[0.456]], [[0.406]]]
    std: [[[0.229]], [[0.224]], [[0.225]]]
  observation.images.cam_high:
    # Same ImageNet statistics as the wrist camera.
    mean: [[[0.485]], [[0.456]], [[0.406]]]
    std: [[[0.229]], [[0.224]], [[0.225]]]
# Top-level runtime switches (not part of any nested section).
# Presumably enables automatic mixed precision — confirm against the training script.
use_amp: true
# Presumably wraps the policy with torch.compile — confirm against the training script.
use_torch_compile: true
# Optimizer, schedule and checkpointing settings. Every key here belongs
# under `training`; in particular `batch_size` must not sit at the same level
# as the evaluation batch size.
training:
  offline_steps: 40000
  # No online steps: the run is offline-only.
  online_steps: 0
  eval_freq: -1
  save_freq: 10000
  log_freq: 100
  save_checkpoint: true

  # NOTE(review): the file header records the DOE winner as batch=24,
  # lr=3e-5, which differs from the values below — confirm which is intended.
  batch_size: 28
  lr: 3.5e-5
  lr_backbone: 3.5e-5
  lr_warmup_steps: 500
  drop_n_last_frames: 2
  # Written with an explicit mantissa dot so YAML 1.1 loaders (which read a
  # bare `1e-4` as a string) also resolve it to the float 0.0001.
  weight_decay: 1.0e-4
  grad_clip_norm: 10
  online_steps_between_rollouts: 1

  delta_timestamps:
    # One timestamp per step of the action chunk: i / fps for
    # i in [0, policy.chunk_size). Kept as a quoted string; the ${...}
    # parts are config interpolations.
    action: "[i / ${fps} for i in range(${policy.chunk_size})]"
# Evaluation settings. Nested under `eval` so that this batch_size stays
# separate from the training batch size instead of overriding it.
eval:
  n_episodes: 50
  batch_size: 50
# Policy definition: input/output structure, normalization, ACT visual
# encoder, diffusion U-Net and noise scheduler. All keys live under `policy`
# (the training section refers to `${policy.chunk_size}`).
policy:
  name: hybrid_act_diffusion

  # Input / output structure.
  n_obs_steps: 1
  chunk_size: 100
  n_action_steps: 100

  input_shapes:
    # Camera entries are [channels, height, width].
    observation.images.cam_left_wrist: [3, 480, 640]
    observation.images.cam_high: [3, 480, 640]
    # State / action dimensions are interpolated from the env config.
    observation.state: ["${env.state_dim}"]
  output_shapes:
    action: ["${env.action_dim}"]

  # Normalization / unnormalization mode per feature.
  input_normalization_modes:
    observation.images.cam_left_wrist: mean_std
    observation.images.cam_high: mean_std
    observation.state: mean_std
  output_normalization_modes:
    action: mean_std

  # ACT visual encoder
  vision_backbone: resnet18
  pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
  replace_final_stride_with_dilation: false
  pre_norm: false
  dim_model: 512
  n_heads: 8
  dim_feedforward: 3200
  feedforward_activation: relu
  n_encoder_layers: 4
  dropout: 0.1

  # Diffusion U-Net
  down_dims: [256, 512]
  kernel_size: 5
  n_groups: 8
  diffusion_step_embed_dim: 128
  use_film_scale_modulation: true

  # Noise scheduler
  # NOTE(review): DDPM is selected while num_inference_steps (10) is far
  # below num_train_timesteps (100); confirm that DDPM rather than DDIM is
  # the intended sampler.
  noise_scheduler_type: DDPM
  num_train_timesteps: 100
  beta_schedule: squaredcos_cap_v2
  beta_start: 0.0001
  beta_end: 0.02
  prediction_type: epsilon
  clip_sample: true
  clip_sample_range: 1.0
  num_inference_steps: 10

  do_mask_loss_for_padding: true