Training configuration (YAML):

```yaml
# Training mode configuration
training_mode: 'llada'  # 'llada' or 'dream'

# Model and data path configuration
paths:
  model: 'GSAI-ML/LLaDA-8B-Instruct'
  experiment: 'ckpt_llada_instruct'
  data:
    bs: 'Lansechen/bs17k_collection_filtered_hard_maxlength600'
    bs_easy: 'Lansechen/bs17k_collection_filtered_easy_maxlength600'

denoiser:
  encoder:
    name: 'dream'
    mask_id: 151666
  decoder:
    wiinit: true
    name: 'eagle_rope'
    num_blocks: 1
    seq_len: &seq_len 1024
    input_dim: 3584
    hidden_dim: &dim 3584
    vocab_size: 152064
    block:
      seq_len: *seq_len
      hidden_dim: *dim
      num_heads: 32

train:
  # When left empty, these fall back to the paths.experiment path
  decoder_resume_path:
  head_resume_path:
  skipped_keys:
  global_step:
  exp_name: &exp_name 'llada_ddt_maskteacher'
  wandb_proj:
  output_dir: 'ddt_test'
  logging_dir: 'logs'
  mixed_precision: 'fp16'
  gradient_accumulation_steps: 5
  report_to: 'wandb'
  block_size: 16
  lr: 1e-5
  num_iters: 50000
  eval_every: 100000
  save_every: 1000
  enable_shift: true
  share_steps: 2
  self_align: true
  feature_align: false
  self_step: true
  data:
    name: 'bs17k'  # one of ['numinamath', 'bs17k']
    batch_size: 1
    max_length:
```
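For reference, a minimal sketch of consuming this file with PyYAML and checking that the anchor/alias pairs resolve. The file name `config.yaml`, the asserts, and the effective-batch-size arithmetic are illustrative assumptions, not part of the project:

```python
# Minimal sketch: load the config above (assumed saved as "config.yaml")
# and inspect a few fields. The real trainer may load it differently.
import yaml

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

# The &seq_len/*seq_len and &dim/*dim anchor-alias pairs resolve at load
# time, so the decoder's transformer block inherits its parent dimensions.
decoder = cfg["denoiser"]["decoder"]
assert decoder["block"]["seq_len"] == decoder["seq_len"] == 1024
assert decoder["block"]["hidden_dim"] == decoder["hidden_dim"] == 3584

# Keys left empty (decoder_resume_path:, max_length:, ...) parse as None,
# which downstream code can treat as "not set".
train = cfg["train"]
assert train["decoder_resume_path"] is None

# PyYAML gotcha: under YAML 1.1 resolution, `1e-5` (no decimal point)
# loads as the string '1e-5', so cast defensively. YAML 1.2 loaders
# such as ruamel.yaml or OmegaConf parse it as a float directly.
lr = float(train["lr"])

# With batch_size 1 and gradient_accumulation_steps 5, the effective
# per-device batch size is 1 * 5 = 5.
effective_batch = train["data"]["batch_size"] * train["gradient_accumulation_steps"]
print("lr:", lr, "effective batch size:", effective_batch)
```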