| experiment: | |
| name: distill_dimo | |
| output_dir: ./experiments/distill_dimo | |
| log_every: 50 | |
| save_every: 1000 | |
| resume_iter: 0 | |
| training: | |
| seed: 42 | |
| mixed_precision: bf16 | |
| max_train_steps: 10000 | |
| gradient_accumulation_steps: 1 | |
| distill: | |
| teacher_ckpt: /gfs/space/private/fengzl/World_Model/URSA-1.7B | |
| prompt_source: /gfs/space/private/fengzl/World_Model/Koala-36M-v1 | |
| num_frames: 17 | |
| height: 320 | |
| width: 512 | |
| max_prompt_length: 320 | |
| batch_size_per_gpu: 1 | |
| lambda_kd: 0.5 | |
| lambda_pg: 1.0 | |
| lambda_ent: 0.01 | |
| tau: 1.0 | |
| tau_kd: 1.0 | |
| enable_teacher_cfg: true | |
| teacher_cfg_scale: 7.0 | |
| teacher_cfg_prob: 1.0 | |
| teacher_cfg_warmup_steps: 2000 | |
| teacher_cfg_trunc: 0.9 | |
| lambda_kd_uncond: 0.3 | |
| reward_use_guided: false | |
| fake_rounds: 1 | |
| use_surrogate_grad: false | |
| lambda_surr: 1.0 | |
| t_curriculum_steps: 10000 | |
| p_init_mix_ratio: 0.2 | |
| p_mix_corrupt_frac: 0.2 | |
| collapse_warn_frac: 0.2 | |
| aux_noise_std: 1.0e-05 | |
| grad_clip: 1.0 | |
| optimizer_student: | |
| target: torch.optim.AdamW | |
| params: | |
| lr: 1.0e-05 | |
| betas: | |
| - 0.9 | |
| - 0.95 | |
| weight_decay: 0.01 | |
| optimizer_aux: | |
| target: torch.optim.AdamW | |
| params: | |
| lr: 1.0e-05 | |
| betas: | |
| - 0.9 | |
| - 0.95 | |
| weight_decay: 0.01 | |
| lr_scheduler: | |
| target: diffnext.engine.lr_scheduler.CosineLR | |
| params: | |
| lr_max: ${optimizer_student.params.lr} | |
| lr_min: 1.0e-06 | |
| max_steps: ${training.max_train_steps} | |
| warmup_steps: 500 | |
| prompt_dataloader: | |
| shuffle_files: true | |
| shuffle_buffer: 50000 | |
| num_workers: 4 | |
| caption_field: caption | |
| config: ./configs/distill_dimo.yaml | |