---
# Distillation run configuration ("distill_dimo").
#
# NOTE(review): this file was recovered from a copy in which every key had been
# collapsed onto a single line, which is not parseable YAML. The nesting below
# was reconstructed from key order and from the two `${...}` references, which
# require `optimizer_student.params.lr` and `training.max_train_steps` to
# exist at exactly those paths. Section boundaries that could not be confirmed
# from the file itself are flagged inline - verify against the consuming code.

experiment:
  name: distill_dimo
  output_dir: ./experiments/distill_dimo
  log_every: 50
  save_every: 1000
  resume_iter: 0

training:
  seed: 42
  mixed_precision: bf16
  # Referenced by lr_scheduler.params.max_steps below.
  max_train_steps: 10000
  gradient_accumulation_steps: 1

distill:
  # Absolute cluster paths; they must exist on the machine running the job.
  teacher_ckpt: /gfs/space/private/fengzl/World_Model/URSA-1.7B
  prompt_source: /gfs/space/private/fengzl/World_Model/Koala-36M-v1
  num_frames: 17
  height: 320
  width: 512
  max_prompt_length: 320
  batch_size_per_gpu: 1
  lambda_kd: 0.5
  lambda_pg: 1.0
  lambda_ent: 0.01
  tau: 1.0
  tau_kd: 1.0
  enable_teacher_cfg: true
  teacher_cfg_scale: 7.0
  teacher_cfg_prob: 1.0
  teacher_cfg_warmup_steps: 2000
  teacher_cfg_trunc: 0.9
  lambda_kd_uncond: 0.3
  reward_use_guided: false
  fake_rounds: 1
  use_surrogate_grad: false
  lambda_surr: 1.0
  t_curriculum_steps: 10000
  p_init_mix_ratio: 0.2
  p_mix_corrupt_frac: 0.2
  collapse_warn_frac: 0.2
  aux_noise_std: 1.0e-05
  # NOTE(review): assumed to be the last key of `distill`; it could instead
  # belong to `training` - confirm where the trainer reads grad_clip.
  grad_clip: 1.0

optimizer_student:
  target: torch.optim.AdamW
  # NOTE(review): lr, betas and weight_decay are all assumed to sit under
  # `params`; only `params.lr` is confirmed (by the lr_max reference below).
  params:
    lr: 1.0e-05
    betas:
      - 0.9
      - 0.95
    weight_decay: 0.01

optimizer_aux:
  target: torch.optim.AdamW
  # NOTE(review): same layout assumption as optimizer_student.params.
  params:
    lr: 1.0e-05
    betas:
      - 0.9
      - 0.95
    weight_decay: 0.01

lr_scheduler:
  target: diffnext.engine.lr_scheduler.CosineLR
  params:
    # The `${...}` strings point at other keys in this file; presumably they
    # are resolved by the config loader (OmegaConf-style) - TODO confirm.
    # Quoted so YAML always reads them as plain strings.
    lr_max: '${optimizer_student.params.lr}'
    lr_min: 1.0e-06
    max_steps: '${training.max_train_steps}'
    warmup_steps: 500

prompt_dataloader:
  shuffle_files: true
  shuffle_buffer: 50000
  num_workers: 4
  caption_field: caption

# NOTE(review): assumed to be a top-level key holding the path of the config
# this run was launched from - confirm it is not part of prompt_dataloader.
config: ./configs/distill_dimo.yaml