hmdliu
/

ae-ablation

Model card Files Files and versions

ae-ablation / base_const / base_const.yaml

hmdliu's picture

Upload folder using huggingface_hub

c6e1dae verified 23 days ago

history blame contribute delete

1.02 kB

	model:
	name: qwen3_phase1_noising_ln
	pretrained_name_or_path: "Qwen/Qwen3-0.6B"
	n_enc: 14
	n_dec: 14
	block_size: 32
	task_prefix_length: 8
	enc_attn_type: causal
	latent_posid_mode: const
	latent_mode: append
	compression_ratios: [4, 8, 16]
	latent_noising_thresh: 0.5
	latent_noising_mode: uniform_rand
	bottleneck_dim: 256

	data:
	hf_name: HuggingFaceFW/fineweb-edu
	hf_subset: sample-100BT
	hf_split: train
	val_num_docs: 256
	max_doc_tokens: 4096
	shuffle_buffer_size: 50000
	num_workers: 4
	pin_memory: true

	train:
	exp_name: 100b_b32d3z256n05_append_const_ln
	per_device_batch_size: 256
	block_size: 32
	max_steps: 500000
	log_every: 100
	eval_every: 10000
	save_every: 50000
	lr: 5.0e-5
	weight_decay: 0.01
	betas: [0.9, 0.99]
	eps: 1.0e-6
	clip_grad_norm: 1.0
	warmup_steps: 1000
	seed: 42
	mixed_precision: bf16
	ema_decay: 0.9999

	checkpointing:
	out_dir: /scratch/hl3797/compress-ar/src_qwen3/checkpoints

	tokenizer:
	tokenizer_name_or_path: "Qwen/Qwen3-0.6B"