---
# Training configuration with five sections: model, data, train,
# checkpointing and tokenizer.
#
# NOTE(review): this file was previously collapsed onto a single line, which
# is not loadable YAML (`model: name: ...` nests a mapping inside a plain
# scalar). The nesting below was reconstructed from the key names; in
# particular `checkpointing` and `tokenizer` are assumed to be top-level
# sections rather than children of `train` -- confirm against the loader.

model:
  name: qwen3_phase1_noising_ln
  pretrained_name_or_path: "Qwen/Qwen3-0.6B"
  n_enc: 14
  n_dec: 14
  # Same value as train.block_size below; presumably the two must stay in
  # sync -- verify against the training code.
  block_size: 32
  task_prefix_length: 8
  enc_attn_type: causal
  latent_posid_mode: const
  latent_mode: append
  compression_ratios: [4, 8, 16]
  latent_noising_thresh: 0.5
  latent_noising_mode: uniform_rand
  bottleneck_dim: 256

data:
  hf_name: HuggingFaceFW/fineweb-edu
  hf_subset: sample-100BT
  hf_split: train
  val_num_docs: 256
  max_doc_tokens: 4096
  shuffle_buffer_size: 50000
  num_workers: 4
  pin_memory: true

train:
  exp_name: 100b_b32d3z256n05_append_const_ln
  per_device_batch_size: 256
  block_size: 32
  max_steps: 500000
  log_every: 100
  eval_every: 10000
  save_every: 50000
  # Keep the decimal point and the signed exponent on lr and eps: YAML 1.1
  # loaders such as PyYAML only resolve scientific notation to a float when
  # both are present (a bare `5e-5` would load as a string).
  lr: 5.0e-5
  weight_decay: 0.01
  betas: [0.9, 0.99]
  eps: 1.0e-6
  clip_grad_norm: 1.0
  warmup_steps: 1000
  seed: 42
  mixed_precision: bf16
  ema_decay: 0.9999

checkpointing:
  # Absolute path under a user-specific scratch directory; adjust when
  # running on a different machine or account.
  out_dir: /scratch/hl3797/compress-ar/src_qwen3/checkpoints

tokenizer:
  # Same identifier as model.pretrained_name_or_path above.
  tokenizer_name_or_path: "Qwen/Qwen3-0.6B"