---
# Training run configuration.
#
# NOTE(review): this document had been collapsed onto a single line, which is
# not valid YAML (a plain scalar cannot contain ": "). The block structure
# below was reconstructed from the sorted key order: keys under `arch` and
# `data` each run in ascending order until the order restarts at the next
# top-level key. Confirm the nesting against the schema of the consuming code.

# Settings grouped under `arch`.
arch:
  H_cycles: 2
  L_cycles: 6
  bptt: true
  forward_dtype: bfloat16
  head_dim: 64
  hidden_size: 512
  intermediate_size: 2048
  # Plain string containing "@"; safe unquoted because "@" is not the first
  # character. How the consumer splits or resolves it is not visible here.
  name: hrm@HRM
  norm_eps: 1.0e-06
  num_layers: 2
  rope_theta: 10000.0

beta1: 0.9
beta2: 0.95
cycles_per_data: 16

# Settings grouped under `data`.
data:
  augment: true
  # Absolute path; presumably environment-specific — verify before reuse.
  dataset_name: /sg-pretrain/datasets/sudoku-extreme-1k
  name: sudoku
  repeat: 200

ema: 0.999
epochs: 20
local_batch_size: 96
log_interval: 5
lr: 0.0001
# NOTE(review): exactly 1.0 — confirm this is the intended ratio.
lr_min_ratio: 1.0
lr_warmup_steps: 2000
# NOTE(review): exactly 1.0 — confirm this is the intended strength.
weight_decay: 1.0