---
# Model architecture.
model:
  n_layer: 2                 # number of transformer blocks
  d_model: 128               # residual-stream width
  n_ctx: 512                 # maximum context length in tokens
  d_head: 16                 # per-head dimension (128 / 16 = 8 heads, if all width is used — TODO confirm)
  d_mlp: 512                 # MLP hidden width (4 * d_model)
  vocab_size: 4096
  use_rms_norm: true         # RMSNorm rather than LayerNorm
  tie_embeddings: false      # separate input/output embedding matrices
  use_positional_embeddings: false  # NOTE(review): no learned positions; presumably position handled elsewhere — confirm
  use_bigram_table: false
  use_attention_sinks: true
  activation: gelu
  dropout: 0.0               # dropout disabled
  use_bias: true
  use_flash_attention: true
# Sparsity settings (both weight and activation sparsity are disabled here).
sparsity:
  enable_weight_sparsity: false
  target_l0_fraction: 1          # dense target; written as int while other fractions are floats — TODO confirm consumer accepts int
  sparsity_anneal_start_fraction: 0.01   # anneal begins at 1% of training
  sparsity_anneal_end_fraction: 0.5      # anneal ends at 50% of training
  anneal_type: linear
  min_weights_per_neuron: 4
  enable_activation_sparsity: false
  activation_topk_fraction: 0.25
  # Comma-separated string (not a YAML list) — consumer presumably splits on ','; verify against loader.
  activation_sparsity_locations: attn_in,attn_out,mlp_in,mlp_out,mlp_neuron,attn_v,attn_k,attn_q
# Optimizer and learning-rate schedule.
optimizer:
  optimizer_type: adamw
  learning_rate: 0.01
  beta1: 0.9
  beta2: 0.95
  weight_decay: 0.1
  eps: 1.0e-06               # signed exponent keeps this a float under YAML 1.1 parsers (e.g. PyYAML) too
  enable_grad_clip: true
  grad_clip_rms: 1.0
  warmup_fraction: 0.01      # presumably fraction of total steps spent in LR warmup — TODO confirm
  enable_lr_decay: true
  use_sharkfin_schedule: false
# Data, run length, logging, checkpointing, and experiment tracking.
training:
  dataset_name: SimpleStories/SimpleStories
  dataset_split: train
  text_column: story
  tokenizer_name: SimpleStories/SimpleStories-1.25M
  total_tokens: 2000000000   # 2B training tokens
  batch_size: 128
  gradient_accumulation_steps: 1
  mixed_precision: bf16
  checkpoint_dir: checkpoints
  checkpoint_every_n_steps: 100000000  # sentinel larger than the run — effectively disables intermediate checkpoints
  keep_n_checkpoints: 5
  log_every_n_steps: 10
  log_gradients_every_n_steps: 10
  log_weights_every_n_steps: 100
  log_sparsity_every_n_steps: 100
  eval_every_n_steps: 20
  val_split: test
  val_holdout_fraction: 0.01
  val_max_batches: 20
  wandb_project: my_sparsity
  wandb_run_name: d128_f1
  wandb_entity: null         # explicit null — fall back to the default W&B entity
  use_wandb: true
  seed: 0
  hf_repo: jacobcd52/ss_d128_f1