# Source: ae-ablation/base_const/base_const.yaml
# Uploaded by hmdliu using huggingface_hub (commit c6e1dae, verified).
# Model settings. The keys below are indented two spaces so that they belong
# to the `model` mapping; left flush with `model:`, the section itself would
# be null and every setting would become a root-level key.
model:
  name: qwen3_phase1_noising_ln
  # Same identifier as the tokenizer's tokenizer_name_or_path in this file.
  pretrained_name_or_path: "Qwen/Qwen3-0.6B"
  # NOTE(review): n_enc / n_dec presumably count encoder / decoder layers;
  # confirm against the code that consumes this config.
  n_enc: 14
  n_dec: 14
  # This file sets block_size a second time in the train settings, with the
  # same value (32). NOTE(review): presumably the two must agree; confirm.
  block_size: 32
  task_prefix_length: 8
  enc_attn_type: causal
  latent_posid_mode: const
  latent_mode: append
  # Short leaf list, kept in flow style.
  compression_ratios: [4, 8, 16]
  latent_noising_thresh: 0.5
  latent_noising_mode: uniform_rand
  bottleneck_dim: 256
# Dataset and data-loading settings. Children are indented two spaces so they
# nest under `data` instead of becoming root-level keys.
data:
  # Dataset identifier, subset and split.
  # NOTE(review): the hf_ prefix suggests a Hugging Face Hub dataset; confirm
  # how the loader uses these three values.
  hf_name: HuggingFaceFW/fineweb-edu
  hf_subset: sample-100BT
  hf_split: train
  val_num_docs: 256
  max_doc_tokens: 4096
  shuffle_buffer_size: 50000
  num_workers: 4
  # Canonical lowercase boolean.
  pin_memory: true
# Training-run settings. Children are indented two spaces so they nest under
# `train`; in particular this keeps `block_size` here from colliding with the
# `block_size` key that also appears in the model settings.
train:
  # NOTE(review): the run name appears to encode other values from this file
  # (e.g. b32 ~ block_size 32, z256 ~ bottleneck_dim 256,
  # n05 ~ latent_noising_thresh 0.5); confirm before renaming either side.
  exp_name: 100b_b32d3z256n05_append_const_ln
  per_device_batch_size: 256
  # Same value as the model settings' block_size (32).
  block_size: 32
  max_steps: 500000
  # Intervals for logging, evaluation and checkpoint saving.
  # NOTE(review): presumably measured in optimizer steps; confirm.
  log_every: 100
  eval_every: 10000
  save_every: 50000
  # lr and eps are written with a decimal point and a signed exponent; keep
  # that spelling, because YAML 1.1 loaders (e.g. PyYAML) only resolve this
  # form to a float and would read `5e-5` as a string.
  lr: 5.0e-5
  weight_decay: 0.01
  betas: [0.9, 0.99]
  eps: 1.0e-6
  clip_grad_norm: 1.0
  warmup_steps: 1000
  seed: 42
  mixed_precision: bf16
  ema_decay: 0.9999
# Checkpoint output settings. `out_dir` is indented so it nests under
# `checkpointing` instead of becoming a root-level key.
checkpointing:
  # NOTE(review): absolute, user-specific path; override it when running on a
  # different machine or account.
  out_dir: /scratch/hl3797/compress-ar/src_qwen3/checkpoints
# Tokenizer settings. The child key is indented so it nests under `tokenizer`
# instead of becoming a root-level key.
tokenizer:
  # Same identifier as the model's pretrained_name_or_path in this file.
  tokenizer_name_or_path: "Qwen/Qwen3-0.6B"