| model: | |
| name: qwen3_phase1_noising_ln | |
| pretrained_name_or_path: "Qwen/Qwen3-0.6B" | |
| n_enc: 14 | |
| n_dec: 14 | |
| block_size: 32 | |
| task_prefix_length: 8 | |
| enc_attn_type: causal | |
| latent_posid_mode: const | |
| latent_mode: append | |
| compression_ratios: [4, 8, 16] | |
| latent_noising_thresh: 0.5 | |
| latent_noising_mode: uniform_rand | |
| bottleneck_dim: 256 | |
| data: | |
| hf_name: HuggingFaceFW/fineweb-edu | |
| hf_subset: sample-100BT | |
| hf_split: train | |
| val_num_docs: 256 | |
| max_doc_tokens: 4096 | |
| shuffle_buffer_size: 50000 | |
| num_workers: 4 | |
| pin_memory: true | |
| train: | |
| exp_name: 100b_b32d3z256n05_append_const_ln | |
| per_device_batch_size: 256 | |
| block_size: 32 | |
| max_steps: 500000 | |
| log_every: 100 | |
| eval_every: 10000 | |
| save_every: 50000 | |
| lr: 5.0e-5 | |
| weight_decay: 0.01 | |
| betas: [0.9, 0.99] | |
| eps: 1.0e-6 | |
| clip_grad_norm: 1.0 | |
| warmup_steps: 1000 | |
| seed: 42 | |
| mixed_precision: bf16 | |
| ema_decay: 0.9999 | |
| checkpointing: | |
| out_dir: /scratch/hl3797/compress-ar/src_qwen3/checkpoints | |
| tokenizer: | |
| tokenizer_name_or_path: "Qwen/Qwen3-0.6B" | |