Upload folder using huggingface_hub
Browse files- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/BatchTopKTrainer-HuggingFaceTB_SmolLM-135M-resid_post_layer_12_trainer_0/ae.pt +3 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/BatchTopKTrainer-HuggingFaceTB_SmolLM-135M-resid_post_layer_12_trainer_0/config.json +32 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/BatchTopKTrainer-HuggingFaceTB_SmolLM-135M-resid_post_layer_12_trainer_0/eval_results.json +1 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.10_d65536_lr3.00e-04/ae.pt +3 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.10_d65536_lr3.00e-04/config.json +37 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.25_d65536_lr3.00e-04/ae.pt +3 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.25_d65536_lr3.00e-04/config.json +37 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.50_d65536_lr3.00e-04/ae.pt +3 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.50_d65536_lr3.00e-04/config.json +37 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.75_d65536_lr3.00e-04/ae.pt +3 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.75_d65536_lr3.00e-04/config.json +37 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l2.00_d65536_lr3.00e-04/ae.pt +3 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l2.00_d65536_lr3.00e-04/config.json +37 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.10_d65536_lr3.00e-04/ae.pt +3 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.10_d65536_lr3.00e-04/config.json +37 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.25_d65536_lr3.00e-04/ae.pt +3 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.25_d65536_lr3.00e-04/config.json +37 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.50_d65536_lr3.00e-04/ae.pt +3 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.50_d65536_lr3.00e-04/config.json +37 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.75_d65536_lr3.00e-04/ae.pt +3 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.75_d65536_lr3.00e-04/config.json +37 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l2.00_d65536_lr3.00e-04/ae.pt +3 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l2.00_d65536_lr3.00e-04/config.json +37 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.10_d65536_lr3.00e-04/ae.pt +3 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.10_d65536_lr3.00e-04/config.json +37 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.25_d65536_lr3.00e-04/ae.pt +3 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.25_d65536_lr3.00e-04/config.json +37 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.50_d65536_lr3.00e-04/ae.pt +3 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.50_d65536_lr3.00e-04/config.json +37 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.75_d65536_lr3.00e-04/ae.pt +3 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.75_d65536_lr3.00e-04/config.json +37 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l2.00_d65536_lr3.00e-04/ae.pt +3 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l2.00_d65536_lr3.00e-04/config.json +37 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.10_d65536_lr3.00e-04/ae.pt +3 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.10_d65536_lr3.00e-04/config.json +37 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.25_d65536_lr3.00e-04/ae.pt +3 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.25_d65536_lr3.00e-04/config.json +37 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.50_d65536_lr3.00e-04/ae.pt +3 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.50_d65536_lr3.00e-04/config.json +37 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.75_d65536_lr3.00e-04/ae.pt +3 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.75_d65536_lr3.00e-04/config.json +37 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l2.00_d65536_lr3.00e-04/ae.pt +3 -0
- smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l2.00_d65536_lr3.00e-04/config.json +37 -0
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/BatchTopKTrainer-HuggingFaceTB_SmolLM-135M-resid_post_layer_12_trainer_0/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b45d8b70a80988f330593a37a19c39e398c7033407b3e83d45b7065d72425556
|
| 3 |
+
size 302256406
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/BatchTopKTrainer-HuggingFaceTB_SmolLM-135M-resid_post_layer_12_trainer_0/config.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "BatchTopKTrainer",
|
| 4 |
+
"dict_class": "BatchTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 12207,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 9765,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 288,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 576,
|
| 15 |
+
"dict_size": 65536,
|
| 16 |
+
"k": 60,
|
| 17 |
+
"device": "cuda:0",
|
| 18 |
+
"layer": 12,
|
| 19 |
+
"lm_name": "HuggingFaceTB/SmolLM-135M",
|
| 20 |
+
"wandb_name": "BatchTopKTrainer-HuggingFaceTB/SmolLM-135M-resid_post_layer_12_trainer_0",
|
| 21 |
+
"submodule_name": "resid_post_layer_12"
|
| 22 |
+
},
|
| 23 |
+
"buffer": {
|
| 24 |
+
"d_submodule": 576,
|
| 25 |
+
"io": "out",
|
| 26 |
+
"n_ctxs": 244,
|
| 27 |
+
"ctx_len": 1024,
|
| 28 |
+
"refresh_batch_size": 64,
|
| 29 |
+
"out_batch_size": 2048,
|
| 30 |
+
"device": "cuda:0"
|
| 31 |
+
}
|
| 32 |
+
}
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/BatchTopKTrainer-HuggingFaceTB_SmolLM-135M-resid_post_layer_12_trainer_0/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 92.07174682617188, "l1_loss": 3708.7429962158203, "l0": 64.7215793132782, "frac_variance_explained": 0.9954343065619469, "cossim": 0.9156519845128059, "l2_ratio": 0.9147232957184315, "relative_reconstruction_bias": 1.004285804927349, "loss_original": 2.5805100798606873, "loss_reconstructed": 4.020263820886612, "loss_zero": 10.802701473236084, "frac_recovered": 0.8246423006057739, "frac_alive": 0.0301055908203125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.10_d65536_lr3.00e-04/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fc32cfecdde3cb737b5d67fb89288b94bc997364c6684283e1dcdfb8c104c451
|
| 3 |
+
size 302519834
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.10_d65536_lr3.00e-04/config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "HybridSampledTopKTrainer",
|
| 4 |
+
"dict_class": "HybridSampledTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 12207,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 9765,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"sampling_update_freq": 1,
|
| 13 |
+
"sampling_method": "entropy",
|
| 14 |
+
"ridge_lambda": 0.01,
|
| 15 |
+
"sketching_size": 100,
|
| 16 |
+
"top_k_aux": 288,
|
| 17 |
+
"seed": 0,
|
| 18 |
+
"activation_dim": 576,
|
| 19 |
+
"dict_size": 65536,
|
| 20 |
+
"k": 60,
|
| 21 |
+
"device": "cuda:0",
|
| 22 |
+
"layer": 12,
|
| 23 |
+
"lm_name": "HuggingFaceTB/SmolLM-135M",
|
| 24 |
+
"wandb_name": "HybridSampledSAE_entropy_k60_l1.10_d65536_lr3.00e-04",
|
| 25 |
+
"submodule_name": "resid_post_layer_12",
|
| 26 |
+
"l_multiplier": 1.1
|
| 27 |
+
},
|
| 28 |
+
"buffer": {
|
| 29 |
+
"d_submodule": 576,
|
| 30 |
+
"io": "out",
|
| 31 |
+
"n_ctxs": 244,
|
| 32 |
+
"ctx_len": 1024,
|
| 33 |
+
"refresh_batch_size": 64,
|
| 34 |
+
"out_batch_size": 2048,
|
| 35 |
+
"device": "cuda:0"
|
| 36 |
+
}
|
| 37 |
+
}
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.25_d65536_lr3.00e-04/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d5c149a2d257a27c80b240a25d38e459eb6df735d55e22ebb3339d37cd3255e7
|
| 3 |
+
size 302519834
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.25_d65536_lr3.00e-04/config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "HybridSampledTopKTrainer",
|
| 4 |
+
"dict_class": "HybridSampledTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 12207,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 9765,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"sampling_update_freq": 1,
|
| 13 |
+
"sampling_method": "entropy",
|
| 14 |
+
"ridge_lambda": 0.01,
|
| 15 |
+
"sketching_size": 100,
|
| 16 |
+
"top_k_aux": 288,
|
| 17 |
+
"seed": 0,
|
| 18 |
+
"activation_dim": 576,
|
| 19 |
+
"dict_size": 65536,
|
| 20 |
+
"k": 60,
|
| 21 |
+
"device": "cuda:0",
|
| 22 |
+
"layer": 12,
|
| 23 |
+
"lm_name": "HuggingFaceTB/SmolLM-135M",
|
| 24 |
+
"wandb_name": "HybridSampledSAE_entropy_k60_l1.25_d65536_lr3.00e-04",
|
| 25 |
+
"submodule_name": "resid_post_layer_12",
|
| 26 |
+
"l_multiplier": 1.25
|
| 27 |
+
},
|
| 28 |
+
"buffer": {
|
| 29 |
+
"d_submodule": 576,
|
| 30 |
+
"io": "out",
|
| 31 |
+
"n_ctxs": 244,
|
| 32 |
+
"ctx_len": 1024,
|
| 33 |
+
"refresh_batch_size": 64,
|
| 34 |
+
"out_batch_size": 2048,
|
| 35 |
+
"device": "cuda:0"
|
| 36 |
+
}
|
| 37 |
+
}
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.50_d65536_lr3.00e-04/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3deacaca93feb7a03f4c8d9de787b677a2c148e50fc9a13ec416af992c116892
|
| 3 |
+
size 302519834
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.50_d65536_lr3.00e-04/config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "HybridSampledTopKTrainer",
|
| 4 |
+
"dict_class": "HybridSampledTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 12207,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 9765,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"sampling_update_freq": 1,
|
| 13 |
+
"sampling_method": "entropy",
|
| 14 |
+
"ridge_lambda": 0.01,
|
| 15 |
+
"sketching_size": 100,
|
| 16 |
+
"top_k_aux": 288,
|
| 17 |
+
"seed": 0,
|
| 18 |
+
"activation_dim": 576,
|
| 19 |
+
"dict_size": 65536,
|
| 20 |
+
"k": 60,
|
| 21 |
+
"device": "cuda:0",
|
| 22 |
+
"layer": 12,
|
| 23 |
+
"lm_name": "HuggingFaceTB/SmolLM-135M",
|
| 24 |
+
"wandb_name": "HybridSampledSAE_entropy_k60_l1.50_d65536_lr3.00e-04",
|
| 25 |
+
"submodule_name": "resid_post_layer_12",
|
| 26 |
+
"l_multiplier": 1.5
|
| 27 |
+
},
|
| 28 |
+
"buffer": {
|
| 29 |
+
"d_submodule": 576,
|
| 30 |
+
"io": "out",
|
| 31 |
+
"n_ctxs": 244,
|
| 32 |
+
"ctx_len": 1024,
|
| 33 |
+
"refresh_batch_size": 64,
|
| 34 |
+
"out_batch_size": 2048,
|
| 35 |
+
"device": "cuda:0"
|
| 36 |
+
}
|
| 37 |
+
}
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.75_d65536_lr3.00e-04/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:34054c415ee224e4ed9692138c0a7a91861811d6378a72b63a1f6f8f63f6bb2b
|
| 3 |
+
size 302519834
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.75_d65536_lr3.00e-04/config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "HybridSampledTopKTrainer",
|
| 4 |
+
"dict_class": "HybridSampledTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 12207,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 9765,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"sampling_update_freq": 1,
|
| 13 |
+
"sampling_method": "entropy",
|
| 14 |
+
"ridge_lambda": 0.01,
|
| 15 |
+
"sketching_size": 100,
|
| 16 |
+
"top_k_aux": 288,
|
| 17 |
+
"seed": 0,
|
| 18 |
+
"activation_dim": 576,
|
| 19 |
+
"dict_size": 65536,
|
| 20 |
+
"k": 60,
|
| 21 |
+
"device": "cuda:0",
|
| 22 |
+
"layer": 12,
|
| 23 |
+
"lm_name": "HuggingFaceTB/SmolLM-135M",
|
| 24 |
+
"wandb_name": "HybridSampledSAE_entropy_k60_l1.75_d65536_lr3.00e-04",
|
| 25 |
+
"submodule_name": "resid_post_layer_12",
|
| 26 |
+
"l_multiplier": 1.75
|
| 27 |
+
},
|
| 28 |
+
"buffer": {
|
| 29 |
+
"d_submodule": 576,
|
| 30 |
+
"io": "out",
|
| 31 |
+
"n_ctxs": 244,
|
| 32 |
+
"ctx_len": 1024,
|
| 33 |
+
"refresh_batch_size": 64,
|
| 34 |
+
"out_batch_size": 2048,
|
| 35 |
+
"device": "cuda:0"
|
| 36 |
+
}
|
| 37 |
+
}
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l2.00_d65536_lr3.00e-04/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a5ca1d5ecd1d322415b3fd8648fc715ea9401d01dc09bd3ccc6e11d4a0059aa9
|
| 3 |
+
size 302519834
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l2.00_d65536_lr3.00e-04/config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "HybridSampledTopKTrainer",
|
| 4 |
+
"dict_class": "HybridSampledTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 12207,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 9765,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"sampling_update_freq": 1,
|
| 13 |
+
"sampling_method": "entropy",
|
| 14 |
+
"ridge_lambda": 0.01,
|
| 15 |
+
"sketching_size": 100,
|
| 16 |
+
"top_k_aux": 288,
|
| 17 |
+
"seed": 0,
|
| 18 |
+
"activation_dim": 576,
|
| 19 |
+
"dict_size": 65536,
|
| 20 |
+
"k": 60,
|
| 21 |
+
"device": "cuda:0",
|
| 22 |
+
"layer": 12,
|
| 23 |
+
"lm_name": "HuggingFaceTB/SmolLM-135M",
|
| 24 |
+
"wandb_name": "HybridSampledSAE_entropy_k60_l2.00_d65536_lr3.00e-04",
|
| 25 |
+
"submodule_name": "resid_post_layer_12",
|
| 26 |
+
"l_multiplier": 2.0
|
| 27 |
+
},
|
| 28 |
+
"buffer": {
|
| 29 |
+
"d_submodule": 576,
|
| 30 |
+
"io": "out",
|
| 31 |
+
"n_ctxs": 244,
|
| 32 |
+
"ctx_len": 1024,
|
| 33 |
+
"refresh_batch_size": 64,
|
| 34 |
+
"out_batch_size": 2048,
|
| 35 |
+
"device": "cuda:0"
|
| 36 |
+
}
|
| 37 |
+
}
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.10_d65536_lr3.00e-04/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:51cfcf00e77e9443b20158882b49ee279996ce9ac9918269dc3a1cb0c61aee0e
|
| 3 |
+
size 302519834
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.10_d65536_lr3.00e-04/config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "HybridSampledTopKTrainer",
|
| 4 |
+
"dict_class": "HybridSampledTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 12207,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 9765,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"sampling_update_freq": 1,
|
| 13 |
+
"sampling_method": "l2_norm",
|
| 14 |
+
"ridge_lambda": 0.01,
|
| 15 |
+
"sketching_size": 100,
|
| 16 |
+
"top_k_aux": 288,
|
| 17 |
+
"seed": 0,
|
| 18 |
+
"activation_dim": 576,
|
| 19 |
+
"dict_size": 65536,
|
| 20 |
+
"k": 60,
|
| 21 |
+
"device": "cuda:0",
|
| 22 |
+
"layer": 12,
|
| 23 |
+
"lm_name": "HuggingFaceTB/SmolLM-135M",
|
| 24 |
+
"wandb_name": "HybridSampledSAE_l2_norm_k60_l1.10_d65536_lr3.00e-04",
|
| 25 |
+
"submodule_name": "resid_post_layer_12",
|
| 26 |
+
"l_multiplier": 1.1
|
| 27 |
+
},
|
| 28 |
+
"buffer": {
|
| 29 |
+
"d_submodule": 576,
|
| 30 |
+
"io": "out",
|
| 31 |
+
"n_ctxs": 244,
|
| 32 |
+
"ctx_len": 1024,
|
| 33 |
+
"refresh_batch_size": 64,
|
| 34 |
+
"out_batch_size": 2048,
|
| 35 |
+
"device": "cuda:0"
|
| 36 |
+
}
|
| 37 |
+
}
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.25_d65536_lr3.00e-04/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4a8468809fa5ff7656bf59bb9b478f3142192308351d230c1af83dde886b6623
|
| 3 |
+
size 302519834
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.25_d65536_lr3.00e-04/config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "HybridSampledTopKTrainer",
|
| 4 |
+
"dict_class": "HybridSampledTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 12207,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 9765,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"sampling_update_freq": 1,
|
| 13 |
+
"sampling_method": "l2_norm",
|
| 14 |
+
"ridge_lambda": 0.01,
|
| 15 |
+
"sketching_size": 100,
|
| 16 |
+
"top_k_aux": 288,
|
| 17 |
+
"seed": 0,
|
| 18 |
+
"activation_dim": 576,
|
| 19 |
+
"dict_size": 65536,
|
| 20 |
+
"k": 60,
|
| 21 |
+
"device": "cuda:0",
|
| 22 |
+
"layer": 12,
|
| 23 |
+
"lm_name": "HuggingFaceTB/SmolLM-135M",
|
| 24 |
+
"wandb_name": "HybridSampledSAE_l2_norm_k60_l1.25_d65536_lr3.00e-04",
|
| 25 |
+
"submodule_name": "resid_post_layer_12",
|
| 26 |
+
"l_multiplier": 1.25
|
| 27 |
+
},
|
| 28 |
+
"buffer": {
|
| 29 |
+
"d_submodule": 576,
|
| 30 |
+
"io": "out",
|
| 31 |
+
"n_ctxs": 244,
|
| 32 |
+
"ctx_len": 1024,
|
| 33 |
+
"refresh_batch_size": 64,
|
| 34 |
+
"out_batch_size": 2048,
|
| 35 |
+
"device": "cuda:0"
|
| 36 |
+
}
|
| 37 |
+
}
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.50_d65536_lr3.00e-04/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:08c9cd7925421f014c2e44cdac8f44227eb77543bca330684ac4eec11cbac564
|
| 3 |
+
size 302519834
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.50_d65536_lr3.00e-04/config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "HybridSampledTopKTrainer",
|
| 4 |
+
"dict_class": "HybridSampledTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 12207,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 9765,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"sampling_update_freq": 1,
|
| 13 |
+
"sampling_method": "l2_norm",
|
| 14 |
+
"ridge_lambda": 0.01,
|
| 15 |
+
"sketching_size": 100,
|
| 16 |
+
"top_k_aux": 288,
|
| 17 |
+
"seed": 0,
|
| 18 |
+
"activation_dim": 576,
|
| 19 |
+
"dict_size": 65536,
|
| 20 |
+
"k": 60,
|
| 21 |
+
"device": "cuda:0",
|
| 22 |
+
"layer": 12,
|
| 23 |
+
"lm_name": "HuggingFaceTB/SmolLM-135M",
|
| 24 |
+
"wandb_name": "HybridSampledSAE_l2_norm_k60_l1.50_d65536_lr3.00e-04",
|
| 25 |
+
"submodule_name": "resid_post_layer_12",
|
| 26 |
+
"l_multiplier": 1.5
|
| 27 |
+
},
|
| 28 |
+
"buffer": {
|
| 29 |
+
"d_submodule": 576,
|
| 30 |
+
"io": "out",
|
| 31 |
+
"n_ctxs": 244,
|
| 32 |
+
"ctx_len": 1024,
|
| 33 |
+
"refresh_batch_size": 64,
|
| 34 |
+
"out_batch_size": 2048,
|
| 35 |
+
"device": "cuda:0"
|
| 36 |
+
}
|
| 37 |
+
}
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.75_d65536_lr3.00e-04/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b375cc6c76a59af461fe86847c8a93d4fa16a43504ed3f25245ad7e5fc5a23e
|
| 3 |
+
size 302519834
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.75_d65536_lr3.00e-04/config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "HybridSampledTopKTrainer",
|
| 4 |
+
"dict_class": "HybridSampledTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 12207,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 9765,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"sampling_update_freq": 1,
|
| 13 |
+
"sampling_method": "l2_norm",
|
| 14 |
+
"ridge_lambda": 0.01,
|
| 15 |
+
"sketching_size": 100,
|
| 16 |
+
"top_k_aux": 288,
|
| 17 |
+
"seed": 0,
|
| 18 |
+
"activation_dim": 576,
|
| 19 |
+
"dict_size": 65536,
|
| 20 |
+
"k": 60,
|
| 21 |
+
"device": "cuda:0",
|
| 22 |
+
"layer": 12,
|
| 23 |
+
"lm_name": "HuggingFaceTB/SmolLM-135M",
|
| 24 |
+
"wandb_name": "HybridSampledSAE_l2_norm_k60_l1.75_d65536_lr3.00e-04",
|
| 25 |
+
"submodule_name": "resid_post_layer_12",
|
| 26 |
+
"l_multiplier": 1.75
|
| 27 |
+
},
|
| 28 |
+
"buffer": {
|
| 29 |
+
"d_submodule": 576,
|
| 30 |
+
"io": "out",
|
| 31 |
+
"n_ctxs": 244,
|
| 32 |
+
"ctx_len": 1024,
|
| 33 |
+
"refresh_batch_size": 64,
|
| 34 |
+
"out_batch_size": 2048,
|
| 35 |
+
"device": "cuda:0"
|
| 36 |
+
}
|
| 37 |
+
}
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l2.00_d65536_lr3.00e-04/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6fdbe6e8cdcefce5c1f9879b07b1b1f09530ce78f9e6d929b99328c3ecafe57b
|
| 3 |
+
size 302519834
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l2.00_d65536_lr3.00e-04/config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "HybridSampledTopKTrainer",
|
| 4 |
+
"dict_class": "HybridSampledTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 12207,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 9765,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"sampling_update_freq": 1,
|
| 13 |
+
"sampling_method": "l2_norm",
|
| 14 |
+
"ridge_lambda": 0.01,
|
| 15 |
+
"sketching_size": 100,
|
| 16 |
+
"top_k_aux": 288,
|
| 17 |
+
"seed": 0,
|
| 18 |
+
"activation_dim": 576,
|
| 19 |
+
"dict_size": 65536,
|
| 20 |
+
"k": 60,
|
| 21 |
+
"device": "cuda:0",
|
| 22 |
+
"layer": 12,
|
| 23 |
+
"lm_name": "HuggingFaceTB/SmolLM-135M",
|
| 24 |
+
"wandb_name": "HybridSampledSAE_l2_norm_k60_l2.00_d65536_lr3.00e-04",
|
| 25 |
+
"submodule_name": "resid_post_layer_12",
|
| 26 |
+
"l_multiplier": 2.0
|
| 27 |
+
},
|
| 28 |
+
"buffer": {
|
| 29 |
+
"d_submodule": 576,
|
| 30 |
+
"io": "out",
|
| 31 |
+
"n_ctxs": 244,
|
| 32 |
+
"ctx_len": 1024,
|
| 33 |
+
"refresh_batch_size": 64,
|
| 34 |
+
"out_batch_size": 2048,
|
| 35 |
+
"device": "cuda:0"
|
| 36 |
+
}
|
| 37 |
+
}
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.10_d65536_lr3.00e-04/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e923a59511c661d6afd6fae921ef74a008f9713d49e9eab521cc8b01173169e
|
| 3 |
+
size 302519834
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.10_d65536_lr3.00e-04/config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "HybridSampledTopKTrainer",
|
| 4 |
+
"dict_class": "HybridSampledTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 12207,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 9765,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"sampling_update_freq": 1,
|
| 13 |
+
"sampling_method": "leverage",
|
| 14 |
+
"ridge_lambda": 0.01,
|
| 15 |
+
"sketching_size": 100,
|
| 16 |
+
"top_k_aux": 288,
|
| 17 |
+
"seed": 0,
|
| 18 |
+
"activation_dim": 576,
|
| 19 |
+
"dict_size": 65536,
|
| 20 |
+
"k": 60,
|
| 21 |
+
"device": "cuda:0",
|
| 22 |
+
"layer": 12,
|
| 23 |
+
"lm_name": "HuggingFaceTB/SmolLM-135M",
|
| 24 |
+
"wandb_name": "HybridSampledSAE_leverage_k60_l1.10_d65536_lr3.00e-04",
|
| 25 |
+
"submodule_name": "resid_post_layer_12",
|
| 26 |
+
"l_multiplier": 1.1
|
| 27 |
+
},
|
| 28 |
+
"buffer": {
|
| 29 |
+
"d_submodule": 576,
|
| 30 |
+
"io": "out",
|
| 31 |
+
"n_ctxs": 244,
|
| 32 |
+
"ctx_len": 1024,
|
| 33 |
+
"refresh_batch_size": 64,
|
| 34 |
+
"out_batch_size": 2048,
|
| 35 |
+
"device": "cuda:0"
|
| 36 |
+
}
|
| 37 |
+
}
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.25_d65536_lr3.00e-04/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:428f3b6f988da3206816427c7c01fdb2e82a9adedc7542d1dbeef2edd427357c
|
| 3 |
+
size 302519834
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.25_d65536_lr3.00e-04/config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "HybridSampledTopKTrainer",
|
| 4 |
+
"dict_class": "HybridSampledTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 12207,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 9765,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"sampling_update_freq": 1,
|
| 13 |
+
"sampling_method": "leverage",
|
| 14 |
+
"ridge_lambda": 0.01,
|
| 15 |
+
"sketching_size": 100,
|
| 16 |
+
"top_k_aux": 288,
|
| 17 |
+
"seed": 0,
|
| 18 |
+
"activation_dim": 576,
|
| 19 |
+
"dict_size": 65536,
|
| 20 |
+
"k": 60,
|
| 21 |
+
"device": "cuda:0",
|
| 22 |
+
"layer": 12,
|
| 23 |
+
"lm_name": "HuggingFaceTB/SmolLM-135M",
|
| 24 |
+
"wandb_name": "HybridSampledSAE_leverage_k60_l1.25_d65536_lr3.00e-04",
|
| 25 |
+
"submodule_name": "resid_post_layer_12",
|
| 26 |
+
"l_multiplier": 1.25
|
| 27 |
+
},
|
| 28 |
+
"buffer": {
|
| 29 |
+
"d_submodule": 576,
|
| 30 |
+
"io": "out",
|
| 31 |
+
"n_ctxs": 244,
|
| 32 |
+
"ctx_len": 1024,
|
| 33 |
+
"refresh_batch_size": 64,
|
| 34 |
+
"out_batch_size": 2048,
|
| 35 |
+
"device": "cuda:0"
|
| 36 |
+
}
|
| 37 |
+
}
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.50_d65536_lr3.00e-04/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d93d822910f5af9965db82e52a938ae82a9728b5444da2755b7793e7e3752b5f
|
| 3 |
+
size 302519834
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.50_d65536_lr3.00e-04/config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "HybridSampledTopKTrainer",
|
| 4 |
+
"dict_class": "HybridSampledTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 12207,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 9765,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"sampling_update_freq": 1,
|
| 13 |
+
"sampling_method": "leverage",
|
| 14 |
+
"ridge_lambda": 0.01,
|
| 15 |
+
"sketching_size": 100,
|
| 16 |
+
"top_k_aux": 288,
|
| 17 |
+
"seed": 0,
|
| 18 |
+
"activation_dim": 576,
|
| 19 |
+
"dict_size": 65536,
|
| 20 |
+
"k": 60,
|
| 21 |
+
"device": "cuda:0",
|
| 22 |
+
"layer": 12,
|
| 23 |
+
"lm_name": "HuggingFaceTB/SmolLM-135M",
|
| 24 |
+
"wandb_name": "HybridSampledSAE_leverage_k60_l1.50_d65536_lr3.00e-04",
|
| 25 |
+
"submodule_name": "resid_post_layer_12",
|
| 26 |
+
"l_multiplier": 1.5
|
| 27 |
+
},
|
| 28 |
+
"buffer": {
|
| 29 |
+
"d_submodule": 576,
|
| 30 |
+
"io": "out",
|
| 31 |
+
"n_ctxs": 244,
|
| 32 |
+
"ctx_len": 1024,
|
| 33 |
+
"refresh_batch_size": 64,
|
| 34 |
+
"out_batch_size": 2048,
|
| 35 |
+
"device": "cuda:0"
|
| 36 |
+
}
|
| 37 |
+
}
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.75_d65536_lr3.00e-04/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d98e381efe5bd09e30a43fd443170abede01a9acd0db8db90db839cc6138ac10
|
| 3 |
+
size 302519834
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.75_d65536_lr3.00e-04/config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "HybridSampledTopKTrainer",
|
| 4 |
+
"dict_class": "HybridSampledTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 12207,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 9765,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"sampling_update_freq": 1,
|
| 13 |
+
"sampling_method": "leverage",
|
| 14 |
+
"ridge_lambda": 0.01,
|
| 15 |
+
"sketching_size": 100,
|
| 16 |
+
"top_k_aux": 288,
|
| 17 |
+
"seed": 0,
|
| 18 |
+
"activation_dim": 576,
|
| 19 |
+
"dict_size": 65536,
|
| 20 |
+
"k": 60,
|
| 21 |
+
"device": "cuda:0",
|
| 22 |
+
"layer": 12,
|
| 23 |
+
"lm_name": "HuggingFaceTB/SmolLM-135M",
|
| 24 |
+
"wandb_name": "HybridSampledSAE_leverage_k60_l1.75_d65536_lr3.00e-04",
|
| 25 |
+
"submodule_name": "resid_post_layer_12",
|
| 26 |
+
"l_multiplier": 1.75
|
| 27 |
+
},
|
| 28 |
+
"buffer": {
|
| 29 |
+
"d_submodule": 576,
|
| 30 |
+
"io": "out",
|
| 31 |
+
"n_ctxs": 244,
|
| 32 |
+
"ctx_len": 1024,
|
| 33 |
+
"refresh_batch_size": 64,
|
| 34 |
+
"out_batch_size": 2048,
|
| 35 |
+
"device": "cuda:0"
|
| 36 |
+
}
|
| 37 |
+
}
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l2.00_d65536_lr3.00e-04/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:952011a26432c9550ae672eb50ec3b123b72c9cc9eaca9308f6e3e60ddd8dbfd
|
| 3 |
+
size 302519834
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l2.00_d65536_lr3.00e-04/config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "HybridSampledTopKTrainer",
|
| 4 |
+
"dict_class": "HybridSampledTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 12207,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 9765,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"sampling_update_freq": 1,
|
| 13 |
+
"sampling_method": "leverage",
|
| 14 |
+
"ridge_lambda": 0.01,
|
| 15 |
+
"sketching_size": 100,
|
| 16 |
+
"top_k_aux": 288,
|
| 17 |
+
"seed": 0,
|
| 18 |
+
"activation_dim": 576,
|
| 19 |
+
"dict_size": 65536,
|
| 20 |
+
"k": 60,
|
| 21 |
+
"device": "cuda:0",
|
| 22 |
+
"layer": 12,
|
| 23 |
+
"lm_name": "HuggingFaceTB/SmolLM-135M",
|
| 24 |
+
"wandb_name": "HybridSampledSAE_leverage_k60_l2.00_d65536_lr3.00e-04",
|
| 25 |
+
"submodule_name": "resid_post_layer_12",
|
| 26 |
+
"l_multiplier": 2.0
|
| 27 |
+
},
|
| 28 |
+
"buffer": {
|
| 29 |
+
"d_submodule": 576,
|
| 30 |
+
"io": "out",
|
| 31 |
+
"n_ctxs": 244,
|
| 32 |
+
"ctx_len": 1024,
|
| 33 |
+
"refresh_batch_size": 64,
|
| 34 |
+
"out_batch_size": 2048,
|
| 35 |
+
"device": "cuda:0"
|
| 36 |
+
}
|
| 37 |
+
}
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.10_d65536_lr3.00e-04/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0bc88a820cfe9e0adf54048d73e939fd2c7fa2db8d08ccb745e8c304351afb40
|
| 3 |
+
size 302519834
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.10_d65536_lr3.00e-04/config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "HybridSampledTopKTrainer",
|
| 4 |
+
"dict_class": "HybridSampledTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 12207,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 9765,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"sampling_update_freq": 1,
|
| 13 |
+
"sampling_method": "uniform",
|
| 14 |
+
"ridge_lambda": 0.01,
|
| 15 |
+
"sketching_size": 100,
|
| 16 |
+
"top_k_aux": 288,
|
| 17 |
+
"seed": 0,
|
| 18 |
+
"activation_dim": 576,
|
| 19 |
+
"dict_size": 65536,
|
| 20 |
+
"k": 60,
|
| 21 |
+
"device": "cuda:0",
|
| 22 |
+
"layer": 12,
|
| 23 |
+
"lm_name": "HuggingFaceTB/SmolLM-135M",
|
| 24 |
+
"wandb_name": "HybridSampledSAE_uniform_k60_l1.10_d65536_lr3.00e-04",
|
| 25 |
+
"submodule_name": "resid_post_layer_12",
|
| 26 |
+
"l_multiplier": 1.1
|
| 27 |
+
},
|
| 28 |
+
"buffer": {
|
| 29 |
+
"d_submodule": 576,
|
| 30 |
+
"io": "out",
|
| 31 |
+
"n_ctxs": 244,
|
| 32 |
+
"ctx_len": 1024,
|
| 33 |
+
"refresh_batch_size": 64,
|
| 34 |
+
"out_batch_size": 2048,
|
| 35 |
+
"device": "cuda:0"
|
| 36 |
+
}
|
| 37 |
+
}
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.25_d65536_lr3.00e-04/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c2abf82cb3f2604707fd4958e8052ee788f2246a7a4a60bacc5e73d2fc80af7e
|
| 3 |
+
size 302519834
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.25_d65536_lr3.00e-04/config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "HybridSampledTopKTrainer",
|
| 4 |
+
"dict_class": "HybridSampledTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 12207,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 9765,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"sampling_update_freq": 1,
|
| 13 |
+
"sampling_method": "uniform",
|
| 14 |
+
"ridge_lambda": 0.01,
|
| 15 |
+
"sketching_size": 100,
|
| 16 |
+
"top_k_aux": 288,
|
| 17 |
+
"seed": 0,
|
| 18 |
+
"activation_dim": 576,
|
| 19 |
+
"dict_size": 65536,
|
| 20 |
+
"k": 60,
|
| 21 |
+
"device": "cuda:0",
|
| 22 |
+
"layer": 12,
|
| 23 |
+
"lm_name": "HuggingFaceTB/SmolLM-135M",
|
| 24 |
+
"wandb_name": "HybridSampledSAE_uniform_k60_l1.25_d65536_lr3.00e-04",
|
| 25 |
+
"submodule_name": "resid_post_layer_12",
|
| 26 |
+
"l_multiplier": 1.25
|
| 27 |
+
},
|
| 28 |
+
"buffer": {
|
| 29 |
+
"d_submodule": 576,
|
| 30 |
+
"io": "out",
|
| 31 |
+
"n_ctxs": 244,
|
| 32 |
+
"ctx_len": 1024,
|
| 33 |
+
"refresh_batch_size": 64,
|
| 34 |
+
"out_batch_size": 2048,
|
| 35 |
+
"device": "cuda:0"
|
| 36 |
+
}
|
| 37 |
+
}
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.50_d65536_lr3.00e-04/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b81c8625cac730026a475c2f588c322226f9483cf6eecf9581c93e38a157bf42
|
| 3 |
+
size 302519834
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.50_d65536_lr3.00e-04/config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "HybridSampledTopKTrainer",
|
| 4 |
+
"dict_class": "HybridSampledTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 12207,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 9765,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"sampling_update_freq": 1,
|
| 13 |
+
"sampling_method": "uniform",
|
| 14 |
+
"ridge_lambda": 0.01,
|
| 15 |
+
"sketching_size": 100,
|
| 16 |
+
"top_k_aux": 288,
|
| 17 |
+
"seed": 0,
|
| 18 |
+
"activation_dim": 576,
|
| 19 |
+
"dict_size": 65536,
|
| 20 |
+
"k": 60,
|
| 21 |
+
"device": "cuda:0",
|
| 22 |
+
"layer": 12,
|
| 23 |
+
"lm_name": "HuggingFaceTB/SmolLM-135M",
|
| 24 |
+
"wandb_name": "HybridSampledSAE_uniform_k60_l1.50_d65536_lr3.00e-04",
|
| 25 |
+
"submodule_name": "resid_post_layer_12",
|
| 26 |
+
"l_multiplier": 1.5
|
| 27 |
+
},
|
| 28 |
+
"buffer": {
|
| 29 |
+
"d_submodule": 576,
|
| 30 |
+
"io": "out",
|
| 31 |
+
"n_ctxs": 244,
|
| 32 |
+
"ctx_len": 1024,
|
| 33 |
+
"refresh_batch_size": 64,
|
| 34 |
+
"out_batch_size": 2048,
|
| 35 |
+
"device": "cuda:0"
|
| 36 |
+
}
|
| 37 |
+
}
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.75_d65536_lr3.00e-04/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0ba059a827616a557cbb92a91e940245797a92cab1957f9d5c5e6b0239ca8118
|
| 3 |
+
size 302519834
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.75_d65536_lr3.00e-04/config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "HybridSampledTopKTrainer",
|
| 4 |
+
"dict_class": "HybridSampledTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 12207,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 9765,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"sampling_update_freq": 1,
|
| 13 |
+
"sampling_method": "uniform",
|
| 14 |
+
"ridge_lambda": 0.01,
|
| 15 |
+
"sketching_size": 100,
|
| 16 |
+
"top_k_aux": 288,
|
| 17 |
+
"seed": 0,
|
| 18 |
+
"activation_dim": 576,
|
| 19 |
+
"dict_size": 65536,
|
| 20 |
+
"k": 60,
|
| 21 |
+
"device": "cuda:0",
|
| 22 |
+
"layer": 12,
|
| 23 |
+
"lm_name": "HuggingFaceTB/SmolLM-135M",
|
| 24 |
+
"wandb_name": "HybridSampledSAE_uniform_k60_l1.75_d65536_lr3.00e-04",
|
| 25 |
+
"submodule_name": "resid_post_layer_12",
|
| 26 |
+
"l_multiplier": 1.75
|
| 27 |
+
},
|
| 28 |
+
"buffer": {
|
| 29 |
+
"d_submodule": 576,
|
| 30 |
+
"io": "out",
|
| 31 |
+
"n_ctxs": 244,
|
| 32 |
+
"ctx_len": 1024,
|
| 33 |
+
"refresh_batch_size": 64,
|
| 34 |
+
"out_batch_size": 2048,
|
| 35 |
+
"device": "cuda:0"
|
| 36 |
+
}
|
| 37 |
+
}
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l2.00_d65536_lr3.00e-04/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d2046cda7c40485203d89e9c09171ae7ac6a54f4207cb6214bed6c433dc4410
|
| 3 |
+
size 302519834
|
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l2.00_d65536_lr3.00e-04/config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "HybridSampledTopKTrainer",
|
| 4 |
+
"dict_class": "HybridSampledTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 12207,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 9765,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"sampling_update_freq": 1,
|
| 13 |
+
"sampling_method": "uniform",
|
| 14 |
+
"ridge_lambda": 0.01,
|
| 15 |
+
"sketching_size": 100,
|
| 16 |
+
"top_k_aux": 288,
|
| 17 |
+
"seed": 0,
|
| 18 |
+
"activation_dim": 576,
|
| 19 |
+
"dict_size": 65536,
|
| 20 |
+
"k": 60,
|
| 21 |
+
"device": "cuda:0",
|
| 22 |
+
"layer": 12,
|
| 23 |
+
"lm_name": "HuggingFaceTB/SmolLM-135M",
|
| 24 |
+
"wandb_name": "HybridSampledSAE_uniform_k60_l2.00_d65536_lr3.00e-04",
|
| 25 |
+
"submodule_name": "resid_post_layer_12",
|
| 26 |
+
"l_multiplier": 2.0
|
| 27 |
+
},
|
| 28 |
+
"buffer": {
|
| 29 |
+
"d_submodule": 576,
|
| 30 |
+
"io": "out",
|
| 31 |
+
"n_ctxs": 244,
|
| 32 |
+
"ctx_len": 1024,
|
| 33 |
+
"refresh_batch_size": 64,
|
| 34 |
+
"out_batch_size": 2048,
|
| 35 |
+
"device": "cuda:0"
|
| 36 |
+
}
|
| 37 |
+
}
|