| model: | |
| model_id: simple | |
| # Simple encoder config (spatial conv + temporal transformer) | |
| encoder: | |
| d_model: 512 | |
| n_heads: 8 | |
| n_layers: 12 | |
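| max_seq_len: 64 # Maximum temporal sequence length. NOTE(review): train.sample_window_length (128) exceeds this value; confirm sampling handles windows longer than 64 | |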
| train: | |
| model_id: simple | |
| trainer_id: basic | |
| data_dir: /mnt/data/waypoint_1/owl_control_1.1.x/kbm/fps | |
| target_size: [256, 256] | |
| window_length: 8 | |
| batch_size: 16 | |
| sample_data_dir: /mnt/data/waypoint_1/owl_control_1.1.x/kbm/fps | |
| n_samples: 8 | |
| sample_window_length: 128 | |
| epochs: 1000 | |
| opt: AdamW | |
| opt_kwargs: | |
| lr: 1.0e-4 | |
| betas: [0.9, 0.95] | |
| eps: 1.0e-15 | |
| weight_decay: 1.0e-2 | |
| checkpoint_dir: ./checkpoints/simple | |
| output_path: ./checkpoints/simple/ema | |
| resume_ckpt: latest | |
| sample_interval: 100 | |
| save_interval: 100 | |
| # Use log1p scaling for mouse inputs | |
| use_log1p_scaling: true | |
| logging: | |
| name: shahbuland | |
| project: owl-idm-v3 | |
| run_name: simple-v0 | |