Add files using upload-large-folder tool
Browse files- resid_post_layer_14/trainer_0/ae.pt +3 -0
- resid_post_layer_14/trainer_0/config.json +53 -0
- resid_post_layer_14/trainer_0/eval_results.json +1 -0
- resid_post_layer_14/trainer_1/config.json +53 -0
- resid_post_layer_14/trainer_1/eval_results.json +1 -0
- resid_post_layer_14/trainer_2/config.json +53 -0
- resid_post_layer_14/trainer_2/eval_results.json +1 -0
- resid_post_layer_14/trainer_3/config.json +53 -0
- resid_post_layer_14/trainer_3/eval_results.json +1 -0
- resid_post_layer_21/trainer_0/ae.pt +3 -0
- resid_post_layer_21/trainer_0/checkpoints/ae_0.pt +3 -0
- resid_post_layer_21/trainer_0/checkpoints/ae_122.pt +3 -0
- resid_post_layer_21/trainer_0/checkpoints/ae_1220.pt +3 -0
- resid_post_layer_21/trainer_0/checkpoints/ae_12207.pt +3 -0
- resid_post_layer_21/trainer_0/checkpoints/ae_386.pt +3 -0
- resid_post_layer_21/trainer_0/checkpoints/ae_3860.pt +3 -0
- resid_post_layer_21/trainer_0/checkpoints/ae_38601.pt +3 -0
- resid_post_layer_21/trainer_0/config.json +53 -0
- resid_post_layer_21/trainer_0/eval_results.json +1 -0
- resid_post_layer_21/trainer_1/ae.pt +3 -0
- resid_post_layer_21/trainer_1/checkpoints/ae_0.pt +3 -0
- resid_post_layer_21/trainer_1/checkpoints/ae_122.pt +3 -0
- resid_post_layer_21/trainer_1/checkpoints/ae_1220.pt +3 -0
- resid_post_layer_21/trainer_1/checkpoints/ae_12207.pt +3 -0
- resid_post_layer_21/trainer_1/checkpoints/ae_3860.pt +3 -0
- resid_post_layer_21/trainer_1/checkpoints/ae_38601.pt +3 -0
- resid_post_layer_21/trainer_1/config.json +53 -0
- resid_post_layer_21/trainer_1/eval_results.json +1 -0
- resid_post_layer_21/trainer_2/ae.pt +3 -0
- resid_post_layer_21/trainer_2/checkpoints/ae_386.pt +3 -0
- resid_post_layer_21/trainer_2/config.json +53 -0
- resid_post_layer_21/trainer_2/eval_results.json +1 -0
- resid_post_layer_21/trainer_3/ae.pt +3 -0
- resid_post_layer_21/trainer_3/checkpoints/ae_0.pt +3 -0
- resid_post_layer_21/trainer_3/checkpoints/ae_12207.pt +3 -0
- resid_post_layer_21/trainer_3/checkpoints/ae_386.pt +3 -0
- resid_post_layer_21/trainer_3/config.json +53 -0
- resid_post_layer_21/trainer_3/eval_results.json +1 -0
- resid_post_layer_7/trainer_0/ae.pt +3 -0
- resid_post_layer_7/trainer_0/config.json +53 -0
- resid_post_layer_7/trainer_0/eval_results.json +1 -0
- resid_post_layer_7/trainer_1/ae.pt +3 -0
- resid_post_layer_7/trainer_1/config.json +53 -0
- resid_post_layer_7/trainer_1/eval_results.json +1 -0
- resid_post_layer_7/trainer_2/ae.pt +3 -0
- resid_post_layer_7/trainer_2/config.json +53 -0
- resid_post_layer_7/trainer_2/eval_results.json +1 -0
- resid_post_layer_7/trainer_3/ae.pt +3 -0
- resid_post_layer_7/trainer_3/config.json +53 -0
- resid_post_layer_7/trainer_3/eval_results.json +1 -0
resid_post_layer_14/trainer_0/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e972e49426cc4ad5b47e819305b259747544401e9eab036c195e5925a037589
|
| 3 |
+
size 335622413
|
resid_post_layer_14/trainer_0/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "MatryoshkaBatchTopKTrainer",
|
| 4 |
+
"dict_class": "MatryoshkaBatchTopKSAE",
|
| 5 |
+
"lr": 5e-05,
|
| 6 |
+
"steps": 122070,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 97656,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 1280,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 2560,
|
| 15 |
+
"dict_size": 16384,
|
| 16 |
+
"group_fractions": [
|
| 17 |
+
0.03125,
|
| 18 |
+
0.0625,
|
| 19 |
+
0.125,
|
| 20 |
+
0.25,
|
| 21 |
+
0.53125
|
| 22 |
+
],
|
| 23 |
+
"group_weights": [
|
| 24 |
+
0.2,
|
| 25 |
+
0.2,
|
| 26 |
+
0.2,
|
| 27 |
+
0.2,
|
| 28 |
+
0.2
|
| 29 |
+
],
|
| 30 |
+
"group_sizes": [
|
| 31 |
+
512,
|
| 32 |
+
1024,
|
| 33 |
+
2048,
|
| 34 |
+
4096,
|
| 35 |
+
8704
|
| 36 |
+
],
|
| 37 |
+
"k": 80,
|
| 38 |
+
"device": "cuda:0",
|
| 39 |
+
"layer": 14,
|
| 40 |
+
"lm_name": "google/gemma-4-E4B",
|
| 41 |
+
"wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_14_trainer_0",
|
| 42 |
+
"submodule_name": "resid_post_layer_14"
|
| 43 |
+
},
|
| 44 |
+
"buffer": {
|
| 45 |
+
"d_submodule": 2560,
|
| 46 |
+
"io": "out",
|
| 47 |
+
"n_ctxs": 122,
|
| 48 |
+
"ctx_len": 2048,
|
| 49 |
+
"refresh_batch_size": 16,
|
| 50 |
+
"out_batch_size": 2048,
|
| 51 |
+
"device": "cuda:0"
|
| 52 |
+
}
|
| 53 |
+
}
|
resid_post_layer_14/trainer_0/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 33.73863636363637, "l1_loss": 367.6060606060606, "l0": 86.32287019671816, "frac_variance_explained": 0.7295217803030303, "cossim": 0.94140625, "l2_ratio": 0.949514678030303, "relative_reconstruction_bias": 1.01171875, "frac_alive": 0.99676513671875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
resid_post_layer_14/trainer_1/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "MatryoshkaBatchTopKTrainer",
|
| 4 |
+
"dict_class": "MatryoshkaBatchTopKSAE",
|
| 5 |
+
"lr": 5e-05,
|
| 6 |
+
"steps": 122070,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 97656,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 1280,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 2560,
|
| 15 |
+
"dict_size": 16384,
|
| 16 |
+
"group_fractions": [
|
| 17 |
+
0.03125,
|
| 18 |
+
0.0625,
|
| 19 |
+
0.125,
|
| 20 |
+
0.25,
|
| 21 |
+
0.53125
|
| 22 |
+
],
|
| 23 |
+
"group_weights": [
|
| 24 |
+
0.2,
|
| 25 |
+
0.2,
|
| 26 |
+
0.2,
|
| 27 |
+
0.2,
|
| 28 |
+
0.2
|
| 29 |
+
],
|
| 30 |
+
"group_sizes": [
|
| 31 |
+
512,
|
| 32 |
+
1024,
|
| 33 |
+
2048,
|
| 34 |
+
4096,
|
| 35 |
+
8704
|
| 36 |
+
],
|
| 37 |
+
"k": 160,
|
| 38 |
+
"device": "cuda:0",
|
| 39 |
+
"layer": 14,
|
| 40 |
+
"lm_name": "google/gemma-4-E4B",
|
| 41 |
+
"wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_14_trainer_1",
|
| 42 |
+
"submodule_name": "resid_post_layer_14"
|
| 43 |
+
},
|
| 44 |
+
"buffer": {
|
| 45 |
+
"d_submodule": 2560,
|
| 46 |
+
"io": "out",
|
| 47 |
+
"n_ctxs": 122,
|
| 48 |
+
"ctx_len": 2048,
|
| 49 |
+
"refresh_batch_size": 16,
|
| 50 |
+
"out_batch_size": 2048,
|
| 51 |
+
"device": "cuda:0"
|
| 52 |
+
}
|
| 53 |
+
}
|
resid_post_layer_14/trainer_1/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 30.748106060606062, "l1_loss": 656.2424242424242, "l0": 178.7910262599136, "frac_variance_explained": 0.775094696969697, "cossim": 0.952829071969697, "l2_ratio": 0.9609375, "relative_reconstruction_bias": 1.011600378787879, "frac_alive": 0.99554443359375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
resid_post_layer_14/trainer_2/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "MatryoshkaBatchTopKTrainer",
|
| 4 |
+
"dict_class": "MatryoshkaBatchTopKSAE",
|
| 5 |
+
"lr": 5e-05,
|
| 6 |
+
"steps": 122070,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 97656,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 1280,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 2560,
|
| 15 |
+
"dict_size": 65536,
|
| 16 |
+
"group_fractions": [
|
| 17 |
+
0.03125,
|
| 18 |
+
0.0625,
|
| 19 |
+
0.125,
|
| 20 |
+
0.25,
|
| 21 |
+
0.53125
|
| 22 |
+
],
|
| 23 |
+
"group_weights": [
|
| 24 |
+
0.2,
|
| 25 |
+
0.2,
|
| 26 |
+
0.2,
|
| 27 |
+
0.2,
|
| 28 |
+
0.2
|
| 29 |
+
],
|
| 30 |
+
"group_sizes": [
|
| 31 |
+
2048,
|
| 32 |
+
4096,
|
| 33 |
+
8192,
|
| 34 |
+
16384,
|
| 35 |
+
34816
|
| 36 |
+
],
|
| 37 |
+
"k": 80,
|
| 38 |
+
"device": "cuda:0",
|
| 39 |
+
"layer": 14,
|
| 40 |
+
"lm_name": "google/gemma-4-E4B",
|
| 41 |
+
"wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_14_trainer_2",
|
| 42 |
+
"submodule_name": "resid_post_layer_14"
|
| 43 |
+
},
|
| 44 |
+
"buffer": {
|
| 45 |
+
"d_submodule": 2560,
|
| 46 |
+
"io": "out",
|
| 47 |
+
"n_ctxs": 122,
|
| 48 |
+
"ctx_len": 2048,
|
| 49 |
+
"refresh_batch_size": 16,
|
| 50 |
+
"out_batch_size": 2048,
|
| 51 |
+
"device": "cuda:0"
|
| 52 |
+
}
|
| 53 |
+
}
|
resid_post_layer_14/trainer_2/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 31.926136363636363, "l1_loss": 346.90909090909093, "l0": 87.18260990489613, "frac_variance_explained": 0.7577533143939394, "cossim": 0.9484493371212122, "l2_ratio": 0.9568536931818182, "relative_reconstruction_bias": 1.0106534090909092, "frac_alive": 0.83673095703125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
resid_post_layer_14/trainer_3/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "MatryoshkaBatchTopKTrainer",
|
| 4 |
+
"dict_class": "MatryoshkaBatchTopKSAE",
|
| 5 |
+
"lr": 5e-05,
|
| 6 |
+
"steps": 122070,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 97656,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 1280,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 2560,
|
| 15 |
+
"dict_size": 65536,
|
| 16 |
+
"group_fractions": [
|
| 17 |
+
0.03125,
|
| 18 |
+
0.0625,
|
| 19 |
+
0.125,
|
| 20 |
+
0.25,
|
| 21 |
+
0.53125
|
| 22 |
+
],
|
| 23 |
+
"group_weights": [
|
| 24 |
+
0.2,
|
| 25 |
+
0.2,
|
| 26 |
+
0.2,
|
| 27 |
+
0.2,
|
| 28 |
+
0.2
|
| 29 |
+
],
|
| 30 |
+
"group_sizes": [
|
| 31 |
+
2048,
|
| 32 |
+
4096,
|
| 33 |
+
8192,
|
| 34 |
+
16384,
|
| 35 |
+
34816
|
| 36 |
+
],
|
| 37 |
+
"k": 160,
|
| 38 |
+
"device": "cuda:0",
|
| 39 |
+
"layer": 14,
|
| 40 |
+
"lm_name": "google/gemma-4-E4B",
|
| 41 |
+
"wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_14_trainer_3",
|
| 42 |
+
"submodule_name": "resid_post_layer_14"
|
| 43 |
+
},
|
| 44 |
+
"buffer": {
|
| 45 |
+
"d_submodule": 2560,
|
| 46 |
+
"io": "out",
|
| 47 |
+
"n_ctxs": 122,
|
| 48 |
+
"ctx_len": 2048,
|
| 49 |
+
"refresh_batch_size": 16,
|
| 50 |
+
"out_batch_size": 2048,
|
| 51 |
+
"device": "cuda:0"
|
| 52 |
+
}
|
| 53 |
+
}
|
resid_post_layer_14/trainer_3/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 28.75, "l1_loss": 598.8484848484849, "l0": 179.98043915719697, "frac_variance_explained": 0.8037405303030303, "cossim": 0.95703125, "l2_ratio": 0.96484375, "relative_reconstruction_bias": 1.0095880681818181, "frac_alive": 0.770263671875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
resid_post_layer_21/trainer_0/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:165e90ed2f6253da543c03a24825726264d22e8ad92176ec4ad4dbdb3945270a
|
| 3 |
+
size 335622413
|
resid_post_layer_21/trainer_0/checkpoints/ae_0.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d5c244af1bb6fdc5168d5a0fba14e47f0d7966ef4d7cec1675ee697750f66f27
|
| 3 |
+
size 335622563
|
resid_post_layer_21/trainer_0/checkpoints/ae_122.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:945e387769f7beda9be7b0c9cdd696b2e2abefecece8392a1b4c2e80084442bf
|
| 3 |
+
size 335622585
|
resid_post_layer_21/trainer_0/checkpoints/ae_1220.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:39e6bc706f6b8e964840095580947f17a6bcede000021a3c33cf2aa3cee1936e
|
| 3 |
+
size 335622596
|
resid_post_layer_21/trainer_0/checkpoints/ae_12207.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f5c625417e79276c56a1f143346a90a28a0435662bcb997db3b56bebbc398265
|
| 3 |
+
size 335622863
|
resid_post_layer_21/trainer_0/checkpoints/ae_386.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a7a2b19e91d0bd6e2289554404ee0fb86d26fc506de8b4e8a2ec3a3e7d79c9f1
|
| 3 |
+
size 335622585
|
resid_post_layer_21/trainer_0/checkpoints/ae_3860.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:391cd1f61d02cec575cfbdef45413d669a62933edec176af68033e87692dd893
|
| 3 |
+
size 335622596
|
resid_post_layer_21/trainer_0/checkpoints/ae_38601.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b8b619b9fd567aedb2aa09fb46afcf1a641e6220be5d2deb316ebe807ddf1dfd
|
| 3 |
+
size 335622863
|
resid_post_layer_21/trainer_0/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "MatryoshkaBatchTopKTrainer",
|
| 4 |
+
"dict_class": "MatryoshkaBatchTopKSAE",
|
| 5 |
+
"lr": 5e-05,
|
| 6 |
+
"steps": 122070,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 97656,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 1280,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 2560,
|
| 15 |
+
"dict_size": 16384,
|
| 16 |
+
"group_fractions": [
|
| 17 |
+
0.03125,
|
| 18 |
+
0.0625,
|
| 19 |
+
0.125,
|
| 20 |
+
0.25,
|
| 21 |
+
0.53125
|
| 22 |
+
],
|
| 23 |
+
"group_weights": [
|
| 24 |
+
0.2,
|
| 25 |
+
0.2,
|
| 26 |
+
0.2,
|
| 27 |
+
0.2,
|
| 28 |
+
0.2
|
| 29 |
+
],
|
| 30 |
+
"group_sizes": [
|
| 31 |
+
512,
|
| 32 |
+
1024,
|
| 33 |
+
2048,
|
| 34 |
+
4096,
|
| 35 |
+
8704
|
| 36 |
+
],
|
| 37 |
+
"k": 80,
|
| 38 |
+
"device": "cuda:0",
|
| 39 |
+
"layer": 21,
|
| 40 |
+
"lm_name": "google/gemma-4-E4B",
|
| 41 |
+
"wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_21_trainer_0",
|
| 42 |
+
"submodule_name": "resid_post_layer_21"
|
| 43 |
+
},
|
| 44 |
+
"buffer": {
|
| 45 |
+
"d_submodule": 2560,
|
| 46 |
+
"io": "out",
|
| 47 |
+
"n_ctxs": 122,
|
| 48 |
+
"ctx_len": 2048,
|
| 49 |
+
"refresh_batch_size": 16,
|
| 50 |
+
"out_batch_size": 2048,
|
| 51 |
+
"device": "cuda:0"
|
| 52 |
+
}
|
| 53 |
+
}
|
resid_post_layer_21/trainer_0/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 29.035984848484848, "l1_loss": 389.1212121212121, "l0": 86.1897815357555, "frac_variance_explained": 0.8267045454545454, "cossim": 0.9356060606060606, "l2_ratio": 0.9453125, "relative_reconstruction_bias": 1.0080492424242424, "frac_alive": 0.8565673828125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
resid_post_layer_21/trainer_1/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:10a23c9b3a00ddd46fb78590c3d00ea4c61ef652f192a73f741b9c590251d5b1
|
| 3 |
+
size 335622413
|
resid_post_layer_21/trainer_1/checkpoints/ae_0.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:691f9ddf883175ebdcf92281a5788e1d56f11403bfd15152a0d8fa77ecf2ab1c
|
| 3 |
+
size 335622563
|
resid_post_layer_21/trainer_1/checkpoints/ae_122.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:249fb4fd697ed16d6fc7899de500c9ffedc26d111999ec7cb55a1bd4f7a6967c
|
| 3 |
+
size 335622585
|
resid_post_layer_21/trainer_1/checkpoints/ae_1220.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c82d644ef2719a67149ff1a159f8ad31044dcbcd85b7f6cb541e56bfc5d2d2b8
|
| 3 |
+
size 335622596
|
resid_post_layer_21/trainer_1/checkpoints/ae_12207.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c3c433a364c0e7c8971a486bd3128c869bb4b2195dbb95fea4d536b0a73dc324
|
| 3 |
+
size 335622863
|
resid_post_layer_21/trainer_1/checkpoints/ae_3860.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1120734e71d54fb0babac9283086f3f8b898adfb9944a023fb75017d453b4190
|
| 3 |
+
size 335622596
|
resid_post_layer_21/trainer_1/checkpoints/ae_38601.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e3b816102b2ae8f3d93b3f98fb1bf0f5c40c9b80c838df83c1c0df35492b2ebc
|
| 3 |
+
size 335622863
|
resid_post_layer_21/trainer_1/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "MatryoshkaBatchTopKTrainer",
|
| 4 |
+
"dict_class": "MatryoshkaBatchTopKSAE",
|
| 5 |
+
"lr": 5e-05,
|
| 6 |
+
"steps": 122070,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 97656,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 1280,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 2560,
|
| 15 |
+
"dict_size": 16384,
|
| 16 |
+
"group_fractions": [
|
| 17 |
+
0.03125,
|
| 18 |
+
0.0625,
|
| 19 |
+
0.125,
|
| 20 |
+
0.25,
|
| 21 |
+
0.53125
|
| 22 |
+
],
|
| 23 |
+
"group_weights": [
|
| 24 |
+
0.2,
|
| 25 |
+
0.2,
|
| 26 |
+
0.2,
|
| 27 |
+
0.2,
|
| 28 |
+
0.2
|
| 29 |
+
],
|
| 30 |
+
"group_sizes": [
|
| 31 |
+
512,
|
| 32 |
+
1024,
|
| 33 |
+
2048,
|
| 34 |
+
4096,
|
| 35 |
+
8704
|
| 36 |
+
],
|
| 37 |
+
"k": 160,
|
| 38 |
+
"device": "cuda:0",
|
| 39 |
+
"layer": 21,
|
| 40 |
+
"lm_name": "google/gemma-4-E4B",
|
| 41 |
+
"wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_21_trainer_1",
|
| 42 |
+
"submodule_name": "resid_post_layer_21"
|
| 43 |
+
},
|
| 44 |
+
"buffer": {
|
| 45 |
+
"d_submodule": 2560,
|
| 46 |
+
"io": "out",
|
| 47 |
+
"n_ctxs": 122,
|
| 48 |
+
"ctx_len": 2048,
|
| 49 |
+
"refresh_batch_size": 16,
|
| 50 |
+
"out_batch_size": 2048,
|
| 51 |
+
"device": "cuda:0"
|
| 52 |
+
}
|
| 53 |
+
}
|
resid_post_layer_21/trainer_1/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 25.890151515151516, "l1_loss": 713.7575757575758, "l0": 172.76931184710878, "frac_variance_explained": 0.8618016098484849, "cossim": 0.94921875, "l2_ratio": 0.95703125, "relative_reconstruction_bias": 1.0078125, "frac_alive": 0.82403564453125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
resid_post_layer_21/trainer_2/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4dd3747bb0722cf29b80d44490af87fcebe5b400c9d4e855a9c67eb3b4bf9888
|
| 3 |
+
size 1342451981
|
resid_post_layer_21/trainer_2/checkpoints/ae_386.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:93c79e12e7bc1bccdb7033bc45f1e92f6f3f607c86ac462002106c8318347b7f
|
| 3 |
+
size 1342452153
|
resid_post_layer_21/trainer_2/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "MatryoshkaBatchTopKTrainer",
|
| 4 |
+
"dict_class": "MatryoshkaBatchTopKSAE",
|
| 5 |
+
"lr": 5e-05,
|
| 6 |
+
"steps": 122070,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 97656,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 1280,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 2560,
|
| 15 |
+
"dict_size": 65536,
|
| 16 |
+
"group_fractions": [
|
| 17 |
+
0.03125,
|
| 18 |
+
0.0625,
|
| 19 |
+
0.125,
|
| 20 |
+
0.25,
|
| 21 |
+
0.53125
|
| 22 |
+
],
|
| 23 |
+
"group_weights": [
|
| 24 |
+
0.2,
|
| 25 |
+
0.2,
|
| 26 |
+
0.2,
|
| 27 |
+
0.2,
|
| 28 |
+
0.2
|
| 29 |
+
],
|
| 30 |
+
"group_sizes": [
|
| 31 |
+
2048,
|
| 32 |
+
4096,
|
| 33 |
+
8192,
|
| 34 |
+
16384,
|
| 35 |
+
34816
|
| 36 |
+
],
|
| 37 |
+
"k": 80,
|
| 38 |
+
"device": "cuda:0",
|
| 39 |
+
"layer": 21,
|
| 40 |
+
"lm_name": "google/gemma-4-E4B",
|
| 41 |
+
"wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_21_trainer_2",
|
| 42 |
+
"submodule_name": "resid_post_layer_21"
|
| 43 |
+
},
|
| 44 |
+
"buffer": {
|
| 45 |
+
"d_submodule": 2560,
|
| 46 |
+
"io": "out",
|
| 47 |
+
"n_ctxs": 122,
|
| 48 |
+
"ctx_len": 2048,
|
| 49 |
+
"refresh_batch_size": 16,
|
| 50 |
+
"out_batch_size": 2048,
|
| 51 |
+
"device": "cuda:0"
|
| 52 |
+
}
|
| 53 |
+
}
|
resid_post_layer_21/trainer_2/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 27.90340909090909, "l1_loss": 354.72727272727275, "l0": 86.3529703544848, "frac_variance_explained": 0.8391335227272727, "cossim": 0.94140625, "l2_ratio": 0.9491595643939394, "relative_reconstruction_bias": 1.0078125, "frac_alive": 0.7160797119140625, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
resid_post_layer_21/trainer_3/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0823518391306a0c4b7854174b77165f825a8c4bc0244b554a4b56df0395dd06
|
| 3 |
+
size 1342451981
|
resid_post_layer_21/trainer_3/checkpoints/ae_0.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:125f43f916a01a1c9fd3ae8bd241c185076396658210163cfa52396424584ccd
|
| 3 |
+
size 1342452131
|
resid_post_layer_21/trainer_3/checkpoints/ae_12207.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:94eadcff4ef5be95da7aae55a5c422da6108b63161e37bd828159e2c12b6cc3c
|
| 3 |
+
size 1342452431
|
resid_post_layer_21/trainer_3/checkpoints/ae_386.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fad19132d5b66d0c5359ee4a5bcf115726d4a4775034c346c428a8b03dc2a074
|
| 3 |
+
size 1342452153
|
resid_post_layer_21/trainer_3/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "MatryoshkaBatchTopKTrainer",
|
| 4 |
+
"dict_class": "MatryoshkaBatchTopKSAE",
|
| 5 |
+
"lr": 5e-05,
|
| 6 |
+
"steps": 122070,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 97656,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 1280,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 2560,
|
| 15 |
+
"dict_size": 65536,
|
| 16 |
+
"group_fractions": [
|
| 17 |
+
0.03125,
|
| 18 |
+
0.0625,
|
| 19 |
+
0.125,
|
| 20 |
+
0.25,
|
| 21 |
+
0.53125
|
| 22 |
+
],
|
| 23 |
+
"group_weights": [
|
| 24 |
+
0.2,
|
| 25 |
+
0.2,
|
| 26 |
+
0.2,
|
| 27 |
+
0.2,
|
| 28 |
+
0.2
|
| 29 |
+
],
|
| 30 |
+
"group_sizes": [
|
| 31 |
+
2048,
|
| 32 |
+
4096,
|
| 33 |
+
8192,
|
| 34 |
+
16384,
|
| 35 |
+
34816
|
| 36 |
+
],
|
| 37 |
+
"k": 160,
|
| 38 |
+
"device": "cuda:0",
|
| 39 |
+
"layer": 21,
|
| 40 |
+
"lm_name": "google/gemma-4-E4B",
|
| 41 |
+
"wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_21_trainer_3",
|
| 42 |
+
"submodule_name": "resid_post_layer_21"
|
| 43 |
+
},
|
| 44 |
+
"buffer": {
|
| 45 |
+
"d_submodule": 2560,
|
| 46 |
+
"io": "out",
|
| 47 |
+
"n_ctxs": 122,
|
| 48 |
+
"ctx_len": 2048,
|
| 49 |
+
"refresh_batch_size": 16,
|
| 50 |
+
"out_batch_size": 2048,
|
| 51 |
+
"device": "cuda:0"
|
| 52 |
+
}
|
| 53 |
+
}
|
resid_post_layer_21/trainer_3/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 24.477272727272727, "l1_loss": 605.030303030303, "l0": 172.87360659512606, "frac_variance_explained": 0.8756510416666666, "cossim": 0.9533617424242424, "l2_ratio": 0.9609375, "relative_reconstruction_bias": 1.0078125, "frac_alive": 0.713165283203125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
resid_post_layer_7/trainer_0/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:33d9a3668cb511c09f22665184e6558d6e8504501d3f4f12b2e96486a51cac5f
|
| 3 |
+
size 335622413
|
resid_post_layer_7/trainer_0/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "MatryoshkaBatchTopKTrainer",
|
| 4 |
+
"dict_class": "MatryoshkaBatchTopKSAE",
|
| 5 |
+
"lr": 5e-05,
|
| 6 |
+
"steps": 122070,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 97656,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 1280,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 2560,
|
| 15 |
+
"dict_size": 16384,
|
| 16 |
+
"group_fractions": [
|
| 17 |
+
0.03125,
|
| 18 |
+
0.0625,
|
| 19 |
+
0.125,
|
| 20 |
+
0.25,
|
| 21 |
+
0.53125
|
| 22 |
+
],
|
| 23 |
+
"group_weights": [
|
| 24 |
+
0.2,
|
| 25 |
+
0.2,
|
| 26 |
+
0.2,
|
| 27 |
+
0.2,
|
| 28 |
+
0.2
|
| 29 |
+
],
|
| 30 |
+
"group_sizes": [
|
| 31 |
+
512,
|
| 32 |
+
1024,
|
| 33 |
+
2048,
|
| 34 |
+
4096,
|
| 35 |
+
8704
|
| 36 |
+
],
|
| 37 |
+
"k": 80,
|
| 38 |
+
"device": "cuda:0",
|
| 39 |
+
"layer": 7,
|
| 40 |
+
"lm_name": "google/gemma-4-E4B",
|
| 41 |
+
"wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_7_trainer_0",
|
| 42 |
+
"submodule_name": "resid_post_layer_7"
|
| 43 |
+
},
|
| 44 |
+
"buffer": {
|
| 45 |
+
"d_submodule": 2560,
|
| 46 |
+
"io": "out",
|
| 47 |
+
"n_ctxs": 122,
|
| 48 |
+
"ctx_len": 2048,
|
| 49 |
+
"refresh_batch_size": 16,
|
| 50 |
+
"out_batch_size": 2048,
|
| 51 |
+
"device": "cuda:0"
|
| 52 |
+
}
|
| 53 |
+
}
|
resid_post_layer_7/trainer_0/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 24.009469696969695, "l1_loss": 307.54545454545456, "l0": 92.451904296875, "frac_variance_explained": 0.7856889204545454, "cossim": 0.9562618371212122, "l2_ratio": 0.96484375, "relative_reconstruction_bias": 1.007930871212121, "frac_alive": 0.98370361328125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
resid_post_layer_7/trainer_1/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:21984cbf97693658675bddcdaec65426aa20d49bd7926914339f13f5e53d2827
|
| 3 |
+
size 335622413
|
resid_post_layer_7/trainer_1/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "MatryoshkaBatchTopKTrainer",
|
| 4 |
+
"dict_class": "MatryoshkaBatchTopKSAE",
|
| 5 |
+
"lr": 5e-05,
|
| 6 |
+
"steps": 122070,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 97656,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 1280,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 2560,
|
| 15 |
+
"dict_size": 16384,
|
| 16 |
+
"group_fractions": [
|
| 17 |
+
0.03125,
|
| 18 |
+
0.0625,
|
| 19 |
+
0.125,
|
| 20 |
+
0.25,
|
| 21 |
+
0.53125
|
| 22 |
+
],
|
| 23 |
+
"group_weights": [
|
| 24 |
+
0.2,
|
| 25 |
+
0.2,
|
| 26 |
+
0.2,
|
| 27 |
+
0.2,
|
| 28 |
+
0.2
|
| 29 |
+
],
|
| 30 |
+
"group_sizes": [
|
| 31 |
+
512,
|
| 32 |
+
1024,
|
| 33 |
+
2048,
|
| 34 |
+
4096,
|
| 35 |
+
8704
|
| 36 |
+
],
|
| 37 |
+
"k": 160,
|
| 38 |
+
"device": "cuda:0",
|
| 39 |
+
"layer": 7,
|
| 40 |
+
"lm_name": "google/gemma-4-E4B",
|
| 41 |
+
"wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_7_trainer_1",
|
| 42 |
+
"submodule_name": "resid_post_layer_7"
|
| 43 |
+
},
|
| 44 |
+
"buffer": {
|
| 45 |
+
"d_submodule": 2560,
|
| 46 |
+
"io": "out",
|
| 47 |
+
"n_ctxs": 122,
|
| 48 |
+
"ctx_len": 2048,
|
| 49 |
+
"refresh_batch_size": 16,
|
| 50 |
+
"out_batch_size": 2048,
|
| 51 |
+
"device": "cuda:0"
|
| 52 |
+
}
|
| 53 |
+
}
|
resid_post_layer_7/trainer_1/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 21.954545454545453, "l1_loss": 509.8181818181818, "l0": 185.27754905007103, "frac_variance_explained": 0.8208451704545454, "cossim": 0.9637784090909091, "l2_ratio": 0.97265625, "relative_reconstruction_bias": 1.0078125, "frac_alive": 0.9376220703125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
resid_post_layer_7/trainer_2/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:89b5ea3fe385ce68e9e80924aa94b1f17f248bf0f5aa76e58c491fdd5d82a289
|
| 3 |
+
size 1342451981
|
resid_post_layer_7/trainer_2/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "MatryoshkaBatchTopKTrainer",
|
| 4 |
+
"dict_class": "MatryoshkaBatchTopKSAE",
|
| 5 |
+
"lr": 5e-05,
|
| 6 |
+
"steps": 122070,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 97656,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 1280,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 2560,
|
| 15 |
+
"dict_size": 65536,
|
| 16 |
+
"group_fractions": [
|
| 17 |
+
0.03125,
|
| 18 |
+
0.0625,
|
| 19 |
+
0.125,
|
| 20 |
+
0.25,
|
| 21 |
+
0.53125
|
| 22 |
+
],
|
| 23 |
+
"group_weights": [
|
| 24 |
+
0.2,
|
| 25 |
+
0.2,
|
| 26 |
+
0.2,
|
| 27 |
+
0.2,
|
| 28 |
+
0.2
|
| 29 |
+
],
|
| 30 |
+
"group_sizes": [
|
| 31 |
+
2048,
|
| 32 |
+
4096,
|
| 33 |
+
8192,
|
| 34 |
+
16384,
|
| 35 |
+
34816
|
| 36 |
+
],
|
| 37 |
+
"k": 80,
|
| 38 |
+
"device": "cuda:0",
|
| 39 |
+
"layer": 7,
|
| 40 |
+
"lm_name": "google/gemma-4-E4B",
|
| 41 |
+
"wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_7_trainer_2",
|
| 42 |
+
"submodule_name": "resid_post_layer_7"
|
| 43 |
+
},
|
| 44 |
+
"buffer": {
|
| 45 |
+
"d_submodule": 2560,
|
| 46 |
+
"io": "out",
|
| 47 |
+
"n_ctxs": 122,
|
| 48 |
+
"ctx_len": 2048,
|
| 49 |
+
"refresh_batch_size": 16,
|
| 50 |
+
"out_batch_size": 2048,
|
| 51 |
+
"device": "cuda:0"
|
| 52 |
+
}
|
| 53 |
+
}
|
resid_post_layer_7/trainer_2/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 22.15530303030303, "l1_loss": 281.45454545454544, "l0": 93.40012440536961, "frac_variance_explained": 0.8163470643939394, "cossim": 0.9609375, "l2_ratio": 0.96875, "relative_reconstruction_bias": 1.0078125, "frac_alive": 0.830413818359375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
resid_post_layer_7/trainer_3/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cd7c4889199febcdc335303bd392d5e416345715f6535f462bbd85db7ef69366
|
| 3 |
+
size 1342451981
|
resid_post_layer_7/trainer_3/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "MatryoshkaBatchTopKTrainer",
|
| 4 |
+
"dict_class": "MatryoshkaBatchTopKSAE",
|
| 5 |
+
"lr": 5e-05,
|
| 6 |
+
"steps": 122070,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 97656,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 1280,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 2560,
|
| 15 |
+
"dict_size": 65536,
|
| 16 |
+
"group_fractions": [
|
| 17 |
+
0.03125,
|
| 18 |
+
0.0625,
|
| 19 |
+
0.125,
|
| 20 |
+
0.25,
|
| 21 |
+
0.53125
|
| 22 |
+
],
|
| 23 |
+
"group_weights": [
|
| 24 |
+
0.2,
|
| 25 |
+
0.2,
|
| 26 |
+
0.2,
|
| 27 |
+
0.2,
|
| 28 |
+
0.2
|
| 29 |
+
],
|
| 30 |
+
"group_sizes": [
|
| 31 |
+
2048,
|
| 32 |
+
4096,
|
| 33 |
+
8192,
|
| 34 |
+
16384,
|
| 35 |
+
34816
|
| 36 |
+
],
|
| 37 |
+
"k": 160,
|
| 38 |
+
"device": "cuda:0",
|
| 39 |
+
"layer": 7,
|
| 40 |
+
"lm_name": "google/gemma-4-E4B",
|
| 41 |
+
"wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_7_trainer_3",
|
| 42 |
+
"submodule_name": "resid_post_layer_7"
|
| 43 |
+
},
|
| 44 |
+
"buffer": {
|
| 45 |
+
"d_submodule": 2560,
|
| 46 |
+
"io": "out",
|
| 47 |
+
"n_ctxs": 122,
|
| 48 |
+
"ctx_len": 2048,
|
| 49 |
+
"refresh_batch_size": 16,
|
| 50 |
+
"out_batch_size": 2048,
|
| 51 |
+
"device": "cuda:0"
|
| 52 |
+
}
|
| 53 |
+
}
|
resid_post_layer_7/trainer_3/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 19.914772727272727, "l1_loss": 427.3939393939394, "l0": 188.28258190733013, "frac_variance_explained": 0.8515625, "cossim": 0.96875, "l2_ratio": 0.9765625, "relative_reconstruction_bias": 1.0078125, "frac_alive": 0.8144989013671875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|