# train_pose2rot_v7_ddp_b4.yaml
# v6 recipe (memabl+tvar) at batch 4, DDP on 2 GPUs, for 3.6x throughput.
#
# NOTE(review): this file was previously collapsed onto a single line that
# started with '#', so a YAML parser saw only a comment and loaded an empty
# document. The nesting below was reconstructed from key order; confirm it
# against the config loader (in particular where `loss` / `weight` live).
---
name: Pose2Rot training

# Process-wide runtime settings.
runtime:
  device: cuda
  seed: 42
  debug: false

# Where checkpoints are written.
output:
  checkpoint_root: ./checkpoints/pose2rot

# NOTE(review): the experiment tag says v10 while the file header says v7/v6;
# confirm which one is intended.
experiment:
  exp: exp_pose2rot_v10_split_heldout

# Model class (dotted import path) and its constructor parameters.
model:
  target: models.v2.pose2rot.model.Pose2RotMemoryRestModel
  params:
    q_dim: 256
    rest_layers: 4
    pose_layers: 4
    memory_layers: 4
    decoder_layers: 10
    num_heads: 8
    joint_embed_dim: 768
    temporal_window: 2
    temporal_dropout: 0.1
    decoder_cond_mode: add  # add | concat
    pose_rest_film: true
    memory_rest_film: true
    decoder_rest_film: true
    pose_use_graph: true
    use_grad_checkpoint: false
    # MEMORY ABLATION: no decoder cross-attn into the memory bank
    # (kill species-constant leakage).
    decoder_use_cross_layers: 0

# Optimisation schedule and loss configuration.
train:
  batch_size: 4  # DDP global batch = 4/gpu x 2 gpu = 8
  epochs: 60
  grad_accum_steps: 1
  # DDP 2-gpu global batch 8 at the proven-safe lr 2e-4
  # (test if batch-doubling breaks anti-collapse).
  lr: 0.0002
  # Linear LR warmup (codex: tame Adam startup at large scaled LR).
  # NOTE(review): the original comment said "0->8e-4", which does not match
  # lr = 2e-4 above; confirm the actual warmup target.
  warmup_steps: 500
  max_ckpt: 100
  num_workers_train: 6  # 6/proc x 2 proc = 12 of 16 cores
  test_every: 1
  pretrain_ckpt: null
  # NOTE(review): `loss` is assumed to be nested under `train`, with `weight`
  # nested under `loss` — verify against the code that reads this config.
  loss:
    rot_loss_type: smooth_l1
    vel_loss_type: smooth_l1
    acc_loss_type: smooth_l1
    weight:
      root_wt: 0.1
      # FK ramp END=10 (user-requested, following MoCapAnything; gentler than
      # v8b's 30; original note: "fk gradient ~0.33<10 over epoch5-15",
      # constant 10 afterwards).
      fk_wt: 10.0
      vel_wt: 1.0
      # User asked to add acc (2nd-order temporal smoothness);
      # magnitude at random init ~0.0095.
      acc_wt: 1.0
      rot_wt: 1.0
      # Demeaned-temporal supervision (force motion-tracking, anti-collapse).
      tvar_wt: 2.0
  vis_every: 5
  weight_decay: 0.0

# Evaluation data-loader settings.
eval:
  batch_size: 1
  num_workers: 2

# Dataset locations and loading options.
data:
  seq_len: 48
  bvh_dir: datasets/zoo1030/bvh
  cache_scale: true
  limit_species_debug: []
  mmap: true
  split_json: datasets/zoo1030/test_split_seen_rare_unseen.json
  # NOTE(review): train and test point at the same memory pickle — confirm
  # this is intentional.
  train_memory_pkl_path: datasets/zoo1030/cache/species_fps_memory_yAll/fps_select_by_rot_32.pkl
  test_memory_pkl_path: datasets/zoo1030/cache/species_fps_memory_yAll/fps_select_by_rot_32.pkl