jacobcd52 committed on
Commit
977d97b
·
verified ·
1 Parent(s): d5a4eaa

Upload training_config.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. training_config.yaml +62 -0
training_config.yaml ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ n_layer: 4
3
+ d_model: 4096
4
+ n_ctx: 512
5
+ d_head: 16
6
+ d_mlp: 16384
7
+ vocab_size: 4096
8
+ use_rms_norm: true
9
+ tie_embeddings: false
10
+ use_positional_embeddings: false
11
+ use_bigram_table: false
12
+ use_attention_sinks: true
13
+ activation: gelu
14
+ dropout: 0.0
15
+ use_bias: true
16
+ use_flash_attention: true
17
+ sparsity:
18
+ enable_weight_sparsity: true
19
+ target_l0_fraction: 0.0039
20
+ sparsity_anneal_start_fraction: 0.01
21
+ sparsity_anneal_end_fraction: 0.5
22
+ min_weights_per_neuron: 4
23
+ enable_activation_sparsity: true
24
+ activation_topk_fraction: 0.25
25
+ activation_sparsity_locations: attn_in,attn_out,mlp_in,mlp_out,mlp_neuron,attn_v,attn_k,attn_q
26
+ optimizer:
27
+ learning_rate: 0.001
28
+ beta1: 0.9
29
+ beta2: 0.95
30
+ weight_decay: 0.1
31
+ eps: 0.1
32
+ enable_grad_clip: true
33
+ grad_clip_rms: 1.0
34
+ warmup_fraction: 0.01
35
+ enable_lr_decay: true
36
+ use_sharkfin_schedule: false
37
+ training:
38
+ dataset_name: SimpleStories/SimpleStories
39
+ dataset_split: train
40
+ text_column: story
41
+ tokenizer_name: SimpleStories/SimpleStories-1.25M
42
+ total_tokens: 2000000000
43
+ batch_size: 128
44
+ gradient_accumulation_steps: 1
45
+ mixed_precision: bf16
46
+ checkpoint_dir: checkpoints
47
+ checkpoint_every_n_steps: 1000
48
+ keep_n_checkpoints: 5
49
+ log_every_n_steps: 10
50
+ log_gradients_every_n_steps: 100
51
+ log_weights_every_n_steps: 100
52
+ log_sparsity_every_n_steps: 100
53
+ eval_every_n_steps: 20
54
+ val_split: test
55
+ val_holdout_fraction: 0.01
56
+ val_max_batches: 20
57
+ wandb_project: my_sparsity
58
+ wandb_run_name: d4096_f0.0039
59
+ wandb_entity: null
60
+ use_wandb: true
61
+ seed: 0
62
+ hf_repo: jacobcd52/ss_d4096_f0.0039