# ss_d128_f1 / training_config.yaml
# Uploaded by jacobcd52 with huggingface_hub (commit fa93af8, verified)
---
# Transformer architecture hyperparameters.
model:
  n_layer: 2                        # number of transformer blocks
  d_model: 128                      # residual-stream width
  n_ctx: 512                        # maximum context length in tokens
  d_head: 16                        # per-head dimension (presumably d_model/d_head = 8 heads — confirm in model code)
  d_mlp: 512                        # MLP hidden width (4x d_model)
  vocab_size: 4096
  use_rms_norm: true                # RMSNorm instead of LayerNorm
  tie_embeddings: false             # separate input/output embedding matrices
  use_positional_embeddings: false
  use_bigram_table: false
  use_attention_sinks: true
  activation: gelu
  dropout: 0.0                      # dropout disabled
  use_bias: true
  use_flash_attention: true
# Sparsity regularization controls.
# NOTE(review): both weight and activation sparsity are disabled for this run,
# so the remaining keys here are inert defaults.
sparsity:
  enable_weight_sparsity: false
  target_l0_fraction: 1             # fraction of weights kept at end of anneal (1 = fully dense)
  sparsity_anneal_start_fraction: 0.01   # anneal window as fractions of total training
  sparsity_anneal_end_fraction: 0.5
  anneal_type: linear
  min_weights_per_neuron: 4         # floor on retained fan-in per neuron
  enable_activation_sparsity: false
  activation_topk_fraction: 0.25    # fraction of activations kept by top-k
  # Comma-separated string (not a YAML list) — presumably parsed by the trainer; verify against consumer.
  activation_sparsity_locations: attn_in,attn_out,mlp_in,mlp_out,mlp_neuron,attn_v,attn_k,attn_q
# Optimizer and learning-rate schedule.
optimizer:
  optimizer_type: adamw
  learning_rate: 0.01               # peak LR after warmup
  beta1: 0.9
  beta2: 0.95
  weight_decay: 0.1
  eps: 1.0e-06
  enable_grad_clip: true
  grad_clip_rms: 1.0                # clip threshold on gradient RMS
  warmup_fraction: 0.01             # fraction of training spent warming up
  enable_lr_decay: true
  use_sharkfin_schedule: false
# Data, logging, checkpointing, and run-tracking settings.
training:
  dataset_name: SimpleStories/SimpleStories
  dataset_split: train
  text_column: story                # dataset column containing the training text
  tokenizer_name: SimpleStories/SimpleStories-1.25M
  total_tokens: 2000000000          # 2B-token training budget
  batch_size: 128
  gradient_accumulation_steps: 1
  mixed_precision: bf16
  checkpoint_dir: checkpoints
  # Effectively disables intermediate checkpoints (interval far exceeds any plausible step count).
  checkpoint_every_n_steps: 100000000
  keep_n_checkpoints: 5
  log_every_n_steps: 10
  log_gradients_every_n_steps: 10
  log_weights_every_n_steps: 100
  log_sparsity_every_n_steps: 100
  eval_every_n_steps: 20
  val_split: test
  val_holdout_fraction: 0.01        # fraction of data held out for validation
  val_max_batches: 20
  wandb_project: my_sparsity
  wandb_run_name: d128_f1
  wandb_entity: null                # default W&B entity
  use_wandb: true
  seed: 0
  hf_repo: jacobcd52/ss_d128_f1     # Hugging Face repo for artifact upload