Upload training_config.yaml with huggingface_hub
Browse files
- training_config.yaml (+7 -1)
training_config.yaml
CHANGED
@@ -18,7 +18,13 @@ sparse_model:
 use_bias: true
 use_flash_attention: true
 bridges:
+bridge_act_fn: abstopk
 encoder_afrac: 0.25
+threshold_sharpness_init: 1.0
+threshold_sharpness_final: 100.0
+threshold_anneal_start_fraction: 0.0
+threshold_anneal_end_fraction: 0.5
+threshold_init_log_eps: -1.0
 coef_nmse: 1.0
 coef_kl_d2s: 1.0
 coef_kl_s2d: 1.0
@@ -33,7 +39,7 @@ sparsity:
 min_weights_per_neuron: 4
 enable_activation_sparsity: true
 activation_topk_fraction: 0.25
-activation_sparsity_locations: attn_in,attn_out,mlp_in,mlp_out,mlp_neuron,attn_v,attn_k,attn_q
+activation_sparsity_locations: attn_in,attn_out,mlp_in,mlp_out,mlp_neuron,attn_v,attn_k,attn_q,resid_mid,resid_pre
 optimizer:
 optimizer_type: adamw
 learning_rate: 0.001