Narmeen07 commited on
Commit
a3f9d16
·
verified ·
1 Parent(s): 10504ca

Upload folder using huggingface_hub

Browse files
Files changed (43) hide show
  1. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/BatchTopKTrainer-HuggingFaceTB_SmolLM-135M-resid_post_layer_12_trainer_0/ae.pt +3 -0
  2. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/BatchTopKTrainer-HuggingFaceTB_SmolLM-135M-resid_post_layer_12_trainer_0/config.json +32 -0
  3. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/BatchTopKTrainer-HuggingFaceTB_SmolLM-135M-resid_post_layer_12_trainer_0/eval_results.json +1 -0
  4. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.10_d65536_lr3.00e-04/ae.pt +3 -0
  5. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.10_d65536_lr3.00e-04/config.json +37 -0
  6. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.25_d65536_lr3.00e-04/ae.pt +3 -0
  7. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.25_d65536_lr3.00e-04/config.json +37 -0
  8. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.50_d65536_lr3.00e-04/ae.pt +3 -0
  9. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.50_d65536_lr3.00e-04/config.json +37 -0
  10. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.75_d65536_lr3.00e-04/ae.pt +3 -0
  11. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.75_d65536_lr3.00e-04/config.json +37 -0
  12. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l2.00_d65536_lr3.00e-04/ae.pt +3 -0
  13. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l2.00_d65536_lr3.00e-04/config.json +37 -0
  14. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.10_d65536_lr3.00e-04/ae.pt +3 -0
  15. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.10_d65536_lr3.00e-04/config.json +37 -0
  16. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.25_d65536_lr3.00e-04/ae.pt +3 -0
  17. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.25_d65536_lr3.00e-04/config.json +37 -0
  18. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.50_d65536_lr3.00e-04/ae.pt +3 -0
  19. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.50_d65536_lr3.00e-04/config.json +37 -0
  20. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.75_d65536_lr3.00e-04/ae.pt +3 -0
  21. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.75_d65536_lr3.00e-04/config.json +37 -0
  22. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l2.00_d65536_lr3.00e-04/ae.pt +3 -0
  23. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l2.00_d65536_lr3.00e-04/config.json +37 -0
  24. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.10_d65536_lr3.00e-04/ae.pt +3 -0
  25. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.10_d65536_lr3.00e-04/config.json +37 -0
  26. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.25_d65536_lr3.00e-04/ae.pt +3 -0
  27. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.25_d65536_lr3.00e-04/config.json +37 -0
  28. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.50_d65536_lr3.00e-04/ae.pt +3 -0
  29. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.50_d65536_lr3.00e-04/config.json +37 -0
  30. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.75_d65536_lr3.00e-04/ae.pt +3 -0
  31. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.75_d65536_lr3.00e-04/config.json +37 -0
  32. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l2.00_d65536_lr3.00e-04/ae.pt +3 -0
  33. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l2.00_d65536_lr3.00e-04/config.json +37 -0
  34. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.10_d65536_lr3.00e-04/ae.pt +3 -0
  35. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.10_d65536_lr3.00e-04/config.json +37 -0
  36. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.25_d65536_lr3.00e-04/ae.pt +3 -0
  37. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.25_d65536_lr3.00e-04/config.json +37 -0
  38. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.50_d65536_lr3.00e-04/ae.pt +3 -0
  39. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.50_d65536_lr3.00e-04/config.json +37 -0
  40. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.75_d65536_lr3.00e-04/ae.pt +3 -0
  41. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.75_d65536_lr3.00e-04/config.json +37 -0
  42. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l2.00_d65536_lr3.00e-04/ae.pt +3 -0
  43. smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l2.00_d65536_lr3.00e-04/config.json +37 -0
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/BatchTopKTrainer-HuggingFaceTB_SmolLM-135M-resid_post_layer_12_trainer_0/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b45d8b70a80988f330593a37a19c39e398c7033407b3e83d45b7065d72425556
3
+ size 302256406
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/BatchTopKTrainer-HuggingFaceTB_SmolLM-135M-resid_post_layer_12_trainer_0/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 12207,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 9765,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 288,
13
+ "seed": 0,
14
+ "activation_dim": 576,
15
+ "dict_size": 65536,
16
+ "k": 60,
17
+ "device": "cuda:0",
18
+ "layer": 12,
19
+ "lm_name": "HuggingFaceTB/SmolLM-135M",
20
+ "wandb_name": "BatchTopKTrainer-HuggingFaceTB/SmolLM-135M-resid_post_layer_12_trainer_0",
21
+ "submodule_name": "resid_post_layer_12"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 576,
25
+ "io": "out",
26
+ "n_ctxs": 244,
27
+ "ctx_len": 1024,
28
+ "refresh_batch_size": 64,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda:0"
31
+ }
32
+ }
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/BatchTopKTrainer-HuggingFaceTB_SmolLM-135M-resid_post_layer_12_trainer_0/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 92.07174682617188, "l1_loss": 3708.7429962158203, "l0": 64.7215793132782, "frac_variance_explained": 0.9954343065619469, "cossim": 0.9156519845128059, "l2_ratio": 0.9147232957184315, "relative_reconstruction_bias": 1.004285804927349, "loss_original": 2.5805100798606873, "loss_reconstructed": 4.020263820886612, "loss_zero": 10.802701473236084, "frac_recovered": 0.8246423006057739, "frac_alive": 0.0301055908203125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.10_d65536_lr3.00e-04/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc32cfecdde3cb737b5d67fb89288b94bc997364c6684283e1dcdfb8c104c451
3
+ size 302519834
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.10_d65536_lr3.00e-04/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "HybridSampledTopKTrainer",
4
+ "dict_class": "HybridSampledTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 12207,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 9765,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "sampling_update_freq": 1,
13
+ "sampling_method": "entropy",
14
+ "ridge_lambda": 0.01,
15
+ "sketching_size": 100,
16
+ "top_k_aux": 288,
17
+ "seed": 0,
18
+ "activation_dim": 576,
19
+ "dict_size": 65536,
20
+ "k": 60,
21
+ "device": "cuda:0",
22
+ "layer": 12,
23
+ "lm_name": "HuggingFaceTB/SmolLM-135M",
24
+ "wandb_name": "HybridSampledSAE_entropy_k60_l1.10_d65536_lr3.00e-04",
25
+ "submodule_name": "resid_post_layer_12",
26
+ "l_multiplier": 1.1
27
+ },
28
+ "buffer": {
29
+ "d_submodule": 576,
30
+ "io": "out",
31
+ "n_ctxs": 244,
32
+ "ctx_len": 1024,
33
+ "refresh_batch_size": 64,
34
+ "out_batch_size": 2048,
35
+ "device": "cuda:0"
36
+ }
37
+ }
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.25_d65536_lr3.00e-04/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5c149a2d257a27c80b240a25d38e459eb6df735d55e22ebb3339d37cd3255e7
3
+ size 302519834
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.25_d65536_lr3.00e-04/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "HybridSampledTopKTrainer",
4
+ "dict_class": "HybridSampledTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 12207,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 9765,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "sampling_update_freq": 1,
13
+ "sampling_method": "entropy",
14
+ "ridge_lambda": 0.01,
15
+ "sketching_size": 100,
16
+ "top_k_aux": 288,
17
+ "seed": 0,
18
+ "activation_dim": 576,
19
+ "dict_size": 65536,
20
+ "k": 60,
21
+ "device": "cuda:0",
22
+ "layer": 12,
23
+ "lm_name": "HuggingFaceTB/SmolLM-135M",
24
+ "wandb_name": "HybridSampledSAE_entropy_k60_l1.25_d65536_lr3.00e-04",
25
+ "submodule_name": "resid_post_layer_12",
26
+ "l_multiplier": 1.25
27
+ },
28
+ "buffer": {
29
+ "d_submodule": 576,
30
+ "io": "out",
31
+ "n_ctxs": 244,
32
+ "ctx_len": 1024,
33
+ "refresh_batch_size": 64,
34
+ "out_batch_size": 2048,
35
+ "device": "cuda:0"
36
+ }
37
+ }
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.50_d65536_lr3.00e-04/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3deacaca93feb7a03f4c8d9de787b677a2c148e50fc9a13ec416af992c116892
3
+ size 302519834
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.50_d65536_lr3.00e-04/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "HybridSampledTopKTrainer",
4
+ "dict_class": "HybridSampledTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 12207,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 9765,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "sampling_update_freq": 1,
13
+ "sampling_method": "entropy",
14
+ "ridge_lambda": 0.01,
15
+ "sketching_size": 100,
16
+ "top_k_aux": 288,
17
+ "seed": 0,
18
+ "activation_dim": 576,
19
+ "dict_size": 65536,
20
+ "k": 60,
21
+ "device": "cuda:0",
22
+ "layer": 12,
23
+ "lm_name": "HuggingFaceTB/SmolLM-135M",
24
+ "wandb_name": "HybridSampledSAE_entropy_k60_l1.50_d65536_lr3.00e-04",
25
+ "submodule_name": "resid_post_layer_12",
26
+ "l_multiplier": 1.5
27
+ },
28
+ "buffer": {
29
+ "d_submodule": 576,
30
+ "io": "out",
31
+ "n_ctxs": 244,
32
+ "ctx_len": 1024,
33
+ "refresh_batch_size": 64,
34
+ "out_batch_size": 2048,
35
+ "device": "cuda:0"
36
+ }
37
+ }
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.75_d65536_lr3.00e-04/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34054c415ee224e4ed9692138c0a7a91861811d6378a72b63a1f6f8f63f6bb2b
3
+ size 302519834
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l1.75_d65536_lr3.00e-04/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "HybridSampledTopKTrainer",
4
+ "dict_class": "HybridSampledTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 12207,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 9765,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "sampling_update_freq": 1,
13
+ "sampling_method": "entropy",
14
+ "ridge_lambda": 0.01,
15
+ "sketching_size": 100,
16
+ "top_k_aux": 288,
17
+ "seed": 0,
18
+ "activation_dim": 576,
19
+ "dict_size": 65536,
20
+ "k": 60,
21
+ "device": "cuda:0",
22
+ "layer": 12,
23
+ "lm_name": "HuggingFaceTB/SmolLM-135M",
24
+ "wandb_name": "HybridSampledSAE_entropy_k60_l1.75_d65536_lr3.00e-04",
25
+ "submodule_name": "resid_post_layer_12",
26
+ "l_multiplier": 1.75
27
+ },
28
+ "buffer": {
29
+ "d_submodule": 576,
30
+ "io": "out",
31
+ "n_ctxs": 244,
32
+ "ctx_len": 1024,
33
+ "refresh_batch_size": 64,
34
+ "out_batch_size": 2048,
35
+ "device": "cuda:0"
36
+ }
37
+ }
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l2.00_d65536_lr3.00e-04/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5ca1d5ecd1d322415b3fd8648fc715ea9401d01dc09bd3ccc6e11d4a0059aa9
3
+ size 302519834
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_entropy_k60_l2.00_d65536_lr3.00e-04/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "HybridSampledTopKTrainer",
4
+ "dict_class": "HybridSampledTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 12207,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 9765,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "sampling_update_freq": 1,
13
+ "sampling_method": "entropy",
14
+ "ridge_lambda": 0.01,
15
+ "sketching_size": 100,
16
+ "top_k_aux": 288,
17
+ "seed": 0,
18
+ "activation_dim": 576,
19
+ "dict_size": 65536,
20
+ "k": 60,
21
+ "device": "cuda:0",
22
+ "layer": 12,
23
+ "lm_name": "HuggingFaceTB/SmolLM-135M",
24
+ "wandb_name": "HybridSampledSAE_entropy_k60_l2.00_d65536_lr3.00e-04",
25
+ "submodule_name": "resid_post_layer_12",
26
+ "l_multiplier": 2.0
27
+ },
28
+ "buffer": {
29
+ "d_submodule": 576,
30
+ "io": "out",
31
+ "n_ctxs": 244,
32
+ "ctx_len": 1024,
33
+ "refresh_batch_size": 64,
34
+ "out_batch_size": 2048,
35
+ "device": "cuda:0"
36
+ }
37
+ }
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.10_d65536_lr3.00e-04/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51cfcf00e77e9443b20158882b49ee279996ce9ac9918269dc3a1cb0c61aee0e
3
+ size 302519834
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.10_d65536_lr3.00e-04/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "HybridSampledTopKTrainer",
4
+ "dict_class": "HybridSampledTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 12207,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 9765,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "sampling_update_freq": 1,
13
+ "sampling_method": "l2_norm",
14
+ "ridge_lambda": 0.01,
15
+ "sketching_size": 100,
16
+ "top_k_aux": 288,
17
+ "seed": 0,
18
+ "activation_dim": 576,
19
+ "dict_size": 65536,
20
+ "k": 60,
21
+ "device": "cuda:0",
22
+ "layer": 12,
23
+ "lm_name": "HuggingFaceTB/SmolLM-135M",
24
+ "wandb_name": "HybridSampledSAE_l2_norm_k60_l1.10_d65536_lr3.00e-04",
25
+ "submodule_name": "resid_post_layer_12",
26
+ "l_multiplier": 1.1
27
+ },
28
+ "buffer": {
29
+ "d_submodule": 576,
30
+ "io": "out",
31
+ "n_ctxs": 244,
32
+ "ctx_len": 1024,
33
+ "refresh_batch_size": 64,
34
+ "out_batch_size": 2048,
35
+ "device": "cuda:0"
36
+ }
37
+ }
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.25_d65536_lr3.00e-04/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a8468809fa5ff7656bf59bb9b478f3142192308351d230c1af83dde886b6623
3
+ size 302519834
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.25_d65536_lr3.00e-04/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "HybridSampledTopKTrainer",
4
+ "dict_class": "HybridSampledTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 12207,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 9765,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "sampling_update_freq": 1,
13
+ "sampling_method": "l2_norm",
14
+ "ridge_lambda": 0.01,
15
+ "sketching_size": 100,
16
+ "top_k_aux": 288,
17
+ "seed": 0,
18
+ "activation_dim": 576,
19
+ "dict_size": 65536,
20
+ "k": 60,
21
+ "device": "cuda:0",
22
+ "layer": 12,
23
+ "lm_name": "HuggingFaceTB/SmolLM-135M",
24
+ "wandb_name": "HybridSampledSAE_l2_norm_k60_l1.25_d65536_lr3.00e-04",
25
+ "submodule_name": "resid_post_layer_12",
26
+ "l_multiplier": 1.25
27
+ },
28
+ "buffer": {
29
+ "d_submodule": 576,
30
+ "io": "out",
31
+ "n_ctxs": 244,
32
+ "ctx_len": 1024,
33
+ "refresh_batch_size": 64,
34
+ "out_batch_size": 2048,
35
+ "device": "cuda:0"
36
+ }
37
+ }
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.50_d65536_lr3.00e-04/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08c9cd7925421f014c2e44cdac8f44227eb77543bca330684ac4eec11cbac564
3
+ size 302519834
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.50_d65536_lr3.00e-04/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "HybridSampledTopKTrainer",
4
+ "dict_class": "HybridSampledTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 12207,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 9765,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "sampling_update_freq": 1,
13
+ "sampling_method": "l2_norm",
14
+ "ridge_lambda": 0.01,
15
+ "sketching_size": 100,
16
+ "top_k_aux": 288,
17
+ "seed": 0,
18
+ "activation_dim": 576,
19
+ "dict_size": 65536,
20
+ "k": 60,
21
+ "device": "cuda:0",
22
+ "layer": 12,
23
+ "lm_name": "HuggingFaceTB/SmolLM-135M",
24
+ "wandb_name": "HybridSampledSAE_l2_norm_k60_l1.50_d65536_lr3.00e-04",
25
+ "submodule_name": "resid_post_layer_12",
26
+ "l_multiplier": 1.5
27
+ },
28
+ "buffer": {
29
+ "d_submodule": 576,
30
+ "io": "out",
31
+ "n_ctxs": 244,
32
+ "ctx_len": 1024,
33
+ "refresh_batch_size": 64,
34
+ "out_batch_size": 2048,
35
+ "device": "cuda:0"
36
+ }
37
+ }
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.75_d65536_lr3.00e-04/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b375cc6c76a59af461fe86847c8a93d4fa16a43504ed3f25245ad7e5fc5a23e
3
+ size 302519834
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l1.75_d65536_lr3.00e-04/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "HybridSampledTopKTrainer",
4
+ "dict_class": "HybridSampledTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 12207,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 9765,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "sampling_update_freq": 1,
13
+ "sampling_method": "l2_norm",
14
+ "ridge_lambda": 0.01,
15
+ "sketching_size": 100,
16
+ "top_k_aux": 288,
17
+ "seed": 0,
18
+ "activation_dim": 576,
19
+ "dict_size": 65536,
20
+ "k": 60,
21
+ "device": "cuda:0",
22
+ "layer": 12,
23
+ "lm_name": "HuggingFaceTB/SmolLM-135M",
24
+ "wandb_name": "HybridSampledSAE_l2_norm_k60_l1.75_d65536_lr3.00e-04",
25
+ "submodule_name": "resid_post_layer_12",
26
+ "l_multiplier": 1.75
27
+ },
28
+ "buffer": {
29
+ "d_submodule": 576,
30
+ "io": "out",
31
+ "n_ctxs": 244,
32
+ "ctx_len": 1024,
33
+ "refresh_batch_size": 64,
34
+ "out_batch_size": 2048,
35
+ "device": "cuda:0"
36
+ }
37
+ }
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l2.00_d65536_lr3.00e-04/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fdbe6e8cdcefce5c1f9879b07b1b1f09530ce78f9e6d929b99328c3ecafe57b
3
+ size 302519834
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_l2_norm_k60_l2.00_d65536_lr3.00e-04/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "HybridSampledTopKTrainer",
4
+ "dict_class": "HybridSampledTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 12207,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 9765,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "sampling_update_freq": 1,
13
+ "sampling_method": "l2_norm",
14
+ "ridge_lambda": 0.01,
15
+ "sketching_size": 100,
16
+ "top_k_aux": 288,
17
+ "seed": 0,
18
+ "activation_dim": 576,
19
+ "dict_size": 65536,
20
+ "k": 60,
21
+ "device": "cuda:0",
22
+ "layer": 12,
23
+ "lm_name": "HuggingFaceTB/SmolLM-135M",
24
+ "wandb_name": "HybridSampledSAE_l2_norm_k60_l2.00_d65536_lr3.00e-04",
25
+ "submodule_name": "resid_post_layer_12",
26
+ "l_multiplier": 2.0
27
+ },
28
+ "buffer": {
29
+ "d_submodule": 576,
30
+ "io": "out",
31
+ "n_ctxs": 244,
32
+ "ctx_len": 1024,
33
+ "refresh_batch_size": 64,
34
+ "out_batch_size": 2048,
35
+ "device": "cuda:0"
36
+ }
37
+ }
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.10_d65536_lr3.00e-04/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e923a59511c661d6afd6fae921ef74a008f9713d49e9eab521cc8b01173169e
3
+ size 302519834
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.10_d65536_lr3.00e-04/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "HybridSampledTopKTrainer",
4
+ "dict_class": "HybridSampledTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 12207,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 9765,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "sampling_update_freq": 1,
13
+ "sampling_method": "leverage",
14
+ "ridge_lambda": 0.01,
15
+ "sketching_size": 100,
16
+ "top_k_aux": 288,
17
+ "seed": 0,
18
+ "activation_dim": 576,
19
+ "dict_size": 65536,
20
+ "k": 60,
21
+ "device": "cuda:0",
22
+ "layer": 12,
23
+ "lm_name": "HuggingFaceTB/SmolLM-135M",
24
+ "wandb_name": "HybridSampledSAE_leverage_k60_l1.10_d65536_lr3.00e-04",
25
+ "submodule_name": "resid_post_layer_12",
26
+ "l_multiplier": 1.1
27
+ },
28
+ "buffer": {
29
+ "d_submodule": 576,
30
+ "io": "out",
31
+ "n_ctxs": 244,
32
+ "ctx_len": 1024,
33
+ "refresh_batch_size": 64,
34
+ "out_batch_size": 2048,
35
+ "device": "cuda:0"
36
+ }
37
+ }
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.25_d65536_lr3.00e-04/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:428f3b6f988da3206816427c7c01fdb2e82a9adedc7542d1dbeef2edd427357c
3
+ size 302519834
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.25_d65536_lr3.00e-04/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "HybridSampledTopKTrainer",
4
+ "dict_class": "HybridSampledTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 12207,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 9765,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "sampling_update_freq": 1,
13
+ "sampling_method": "leverage",
14
+ "ridge_lambda": 0.01,
15
+ "sketching_size": 100,
16
+ "top_k_aux": 288,
17
+ "seed": 0,
18
+ "activation_dim": 576,
19
+ "dict_size": 65536,
20
+ "k": 60,
21
+ "device": "cuda:0",
22
+ "layer": 12,
23
+ "lm_name": "HuggingFaceTB/SmolLM-135M",
24
+ "wandb_name": "HybridSampledSAE_leverage_k60_l1.25_d65536_lr3.00e-04",
25
+ "submodule_name": "resid_post_layer_12",
26
+ "l_multiplier": 1.25
27
+ },
28
+ "buffer": {
29
+ "d_submodule": 576,
30
+ "io": "out",
31
+ "n_ctxs": 244,
32
+ "ctx_len": 1024,
33
+ "refresh_batch_size": 64,
34
+ "out_batch_size": 2048,
35
+ "device": "cuda:0"
36
+ }
37
+ }
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.50_d65536_lr3.00e-04/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d93d822910f5af9965db82e52a938ae82a9728b5444da2755b7793e7e3752b5f
3
+ size 302519834
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.50_d65536_lr3.00e-04/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "HybridSampledTopKTrainer",
4
+ "dict_class": "HybridSampledTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 12207,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 9765,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "sampling_update_freq": 1,
13
+ "sampling_method": "leverage",
14
+ "ridge_lambda": 0.01,
15
+ "sketching_size": 100,
16
+ "top_k_aux": 288,
17
+ "seed": 0,
18
+ "activation_dim": 576,
19
+ "dict_size": 65536,
20
+ "k": 60,
21
+ "device": "cuda:0",
22
+ "layer": 12,
23
+ "lm_name": "HuggingFaceTB/SmolLM-135M",
24
+ "wandb_name": "HybridSampledSAE_leverage_k60_l1.50_d65536_lr3.00e-04",
25
+ "submodule_name": "resid_post_layer_12",
26
+ "l_multiplier": 1.5
27
+ },
28
+ "buffer": {
29
+ "d_submodule": 576,
30
+ "io": "out",
31
+ "n_ctxs": 244,
32
+ "ctx_len": 1024,
33
+ "refresh_batch_size": 64,
34
+ "out_batch_size": 2048,
35
+ "device": "cuda:0"
36
+ }
37
+ }
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.75_d65536_lr3.00e-04/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d98e381efe5bd09e30a43fd443170abede01a9acd0db8db90db839cc6138ac10
3
+ size 302519834
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l1.75_d65536_lr3.00e-04/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "HybridSampledTopKTrainer",
4
+ "dict_class": "HybridSampledTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 12207,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 9765,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "sampling_update_freq": 1,
13
+ "sampling_method": "leverage",
14
+ "ridge_lambda": 0.01,
15
+ "sketching_size": 100,
16
+ "top_k_aux": 288,
17
+ "seed": 0,
18
+ "activation_dim": 576,
19
+ "dict_size": 65536,
20
+ "k": 60,
21
+ "device": "cuda:0",
22
+ "layer": 12,
23
+ "lm_name": "HuggingFaceTB/SmolLM-135M",
24
+ "wandb_name": "HybridSampledSAE_leverage_k60_l1.75_d65536_lr3.00e-04",
25
+ "submodule_name": "resid_post_layer_12",
26
+ "l_multiplier": 1.75
27
+ },
28
+ "buffer": {
29
+ "d_submodule": 576,
30
+ "io": "out",
31
+ "n_ctxs": 244,
32
+ "ctx_len": 1024,
33
+ "refresh_batch_size": 64,
34
+ "out_batch_size": 2048,
35
+ "device": "cuda:0"
36
+ }
37
+ }
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l2.00_d65536_lr3.00e-04/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:952011a26432c9550ae672eb50ec3b123b72c9cc9eaca9308f6e3e60ddd8dbfd
3
+ size 302519834
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_leverage_k60_l2.00_d65536_lr3.00e-04/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "HybridSampledTopKTrainer",
4
+ "dict_class": "HybridSampledTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 12207,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 9765,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "sampling_update_freq": 1,
13
+ "sampling_method": "leverage",
14
+ "ridge_lambda": 0.01,
15
+ "sketching_size": 100,
16
+ "top_k_aux": 288,
17
+ "seed": 0,
18
+ "activation_dim": 576,
19
+ "dict_size": 65536,
20
+ "k": 60,
21
+ "device": "cuda:0",
22
+ "layer": 12,
23
+ "lm_name": "HuggingFaceTB/SmolLM-135M",
24
+ "wandb_name": "HybridSampledSAE_leverage_k60_l2.00_d65536_lr3.00e-04",
25
+ "submodule_name": "resid_post_layer_12",
26
+ "l_multiplier": 2.0
27
+ },
28
+ "buffer": {
29
+ "d_submodule": 576,
30
+ "io": "out",
31
+ "n_ctxs": 244,
32
+ "ctx_len": 1024,
33
+ "refresh_batch_size": 64,
34
+ "out_batch_size": 2048,
35
+ "device": "cuda:0"
36
+ }
37
+ }
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.10_d65536_lr3.00e-04/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bc88a820cfe9e0adf54048d73e939fd2c7fa2db8d08ccb745e8c304351afb40
3
+ size 302519834
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.10_d65536_lr3.00e-04/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "HybridSampledTopKTrainer",
4
+ "dict_class": "HybridSampledTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 12207,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 9765,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "sampling_update_freq": 1,
13
+ "sampling_method": "uniform",
14
+ "ridge_lambda": 0.01,
15
+ "sketching_size": 100,
16
+ "top_k_aux": 288,
17
+ "seed": 0,
18
+ "activation_dim": 576,
19
+ "dict_size": 65536,
20
+ "k": 60,
21
+ "device": "cuda:0",
22
+ "layer": 12,
23
+ "lm_name": "HuggingFaceTB/SmolLM-135M",
24
+ "wandb_name": "HybridSampledSAE_uniform_k60_l1.10_d65536_lr3.00e-04",
25
+ "submodule_name": "resid_post_layer_12",
26
+ "l_multiplier": 1.1
27
+ },
28
+ "buffer": {
29
+ "d_submodule": 576,
30
+ "io": "out",
31
+ "n_ctxs": 244,
32
+ "ctx_len": 1024,
33
+ "refresh_batch_size": 64,
34
+ "out_batch_size": 2048,
35
+ "device": "cuda:0"
36
+ }
37
+ }
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.25_d65536_lr3.00e-04/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2abf82cb3f2604707fd4958e8052ee788f2246a7a4a60bacc5e73d2fc80af7e
3
+ size 302519834
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.25_d65536_lr3.00e-04/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "HybridSampledTopKTrainer",
4
+ "dict_class": "HybridSampledTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 12207,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 9765,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "sampling_update_freq": 1,
13
+ "sampling_method": "uniform",
14
+ "ridge_lambda": 0.01,
15
+ "sketching_size": 100,
16
+ "top_k_aux": 288,
17
+ "seed": 0,
18
+ "activation_dim": 576,
19
+ "dict_size": 65536,
20
+ "k": 60,
21
+ "device": "cuda:0",
22
+ "layer": 12,
23
+ "lm_name": "HuggingFaceTB/SmolLM-135M",
24
+ "wandb_name": "HybridSampledSAE_uniform_k60_l1.25_d65536_lr3.00e-04",
25
+ "submodule_name": "resid_post_layer_12",
26
+ "l_multiplier": 1.25
27
+ },
28
+ "buffer": {
29
+ "d_submodule": 576,
30
+ "io": "out",
31
+ "n_ctxs": 244,
32
+ "ctx_len": 1024,
33
+ "refresh_batch_size": 64,
34
+ "out_batch_size": 2048,
35
+ "device": "cuda:0"
36
+ }
37
+ }
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.50_d65536_lr3.00e-04/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b81c8625cac730026a475c2f588c322226f9483cf6eecf9581c93e38a157bf42
3
+ size 302519834
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.50_d65536_lr3.00e-04/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "HybridSampledTopKTrainer",
4
+ "dict_class": "HybridSampledTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 12207,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 9765,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "sampling_update_freq": 1,
13
+ "sampling_method": "uniform",
14
+ "ridge_lambda": 0.01,
15
+ "sketching_size": 100,
16
+ "top_k_aux": 288,
17
+ "seed": 0,
18
+ "activation_dim": 576,
19
+ "dict_size": 65536,
20
+ "k": 60,
21
+ "device": "cuda:0",
22
+ "layer": 12,
23
+ "lm_name": "HuggingFaceTB/SmolLM-135M",
24
+ "wandb_name": "HybridSampledSAE_uniform_k60_l1.50_d65536_lr3.00e-04",
25
+ "submodule_name": "resid_post_layer_12",
26
+ "l_multiplier": 1.5
27
+ },
28
+ "buffer": {
29
+ "d_submodule": 576,
30
+ "io": "out",
31
+ "n_ctxs": 244,
32
+ "ctx_len": 1024,
33
+ "refresh_batch_size": 64,
34
+ "out_batch_size": 2048,
35
+ "device": "cuda:0"
36
+ }
37
+ }
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.75_d65536_lr3.00e-04/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ba059a827616a557cbb92a91e940245797a92cab1957f9d5c5e6b0239ca8118
3
+ size 302519834
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l1.75_d65536_lr3.00e-04/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "HybridSampledTopKTrainer",
4
+ "dict_class": "HybridSampledTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 12207,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 9765,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "sampling_update_freq": 1,
13
+ "sampling_method": "uniform",
14
+ "ridge_lambda": 0.01,
15
+ "sketching_size": 100,
16
+ "top_k_aux": 288,
17
+ "seed": 0,
18
+ "activation_dim": 576,
19
+ "dict_size": 65536,
20
+ "k": 60,
21
+ "device": "cuda:0",
22
+ "layer": 12,
23
+ "lm_name": "HuggingFaceTB/SmolLM-135M",
24
+ "wandb_name": "HybridSampledSAE_uniform_k60_l1.75_d65536_lr3.00e-04",
25
+ "submodule_name": "resid_post_layer_12",
26
+ "l_multiplier": 1.75
27
+ },
28
+ "buffer": {
29
+ "d_submodule": 576,
30
+ "io": "out",
31
+ "n_ctxs": 244,
32
+ "ctx_len": 1024,
33
+ "refresh_batch_size": 64,
34
+ "out_batch_size": 2048,
35
+ "device": "cuda:0"
36
+ }
37
+ }
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l2.00_d65536_lr3.00e-04/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d2046cda7c40485203d89e9c09171ae7ac6a54f4207cb6214bed6c433dc4410
3
+ size 302519834
smol135m_hybrid_bundle_HuggingFaceTB_SmolLM-135M_batch_top_k_hybrid_sampled_top_k/resid_post_layer_12/HybridSampledSAE_uniform_k60_l2.00_d65536_lr3.00e-04/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "HybridSampledTopKTrainer",
4
+ "dict_class": "HybridSampledTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 12207,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 9765,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "sampling_update_freq": 1,
13
+ "sampling_method": "uniform",
14
+ "ridge_lambda": 0.01,
15
+ "sketching_size": 100,
16
+ "top_k_aux": 288,
17
+ "seed": 0,
18
+ "activation_dim": 576,
19
+ "dict_size": 65536,
20
+ "k": 60,
21
+ "device": "cuda:0",
22
+ "layer": 12,
23
+ "lm_name": "HuggingFaceTB/SmolLM-135M",
24
+ "wandb_name": "HybridSampledSAE_uniform_k60_l2.00_d65536_lr3.00e-04",
25
+ "submodule_name": "resid_post_layer_12",
26
+ "l_multiplier": 2.0
27
+ },
28
+ "buffer": {
29
+ "d_submodule": 576,
30
+ "io": "out",
31
+ "n_ctxs": 244,
32
+ "ctx_len": 1024,
33
+ "refresh_batch_size": 64,
34
+ "out_batch_size": 2048,
35
+ "device": "cuda:0"
36
+ }
37
+ }