michaelwaves commited on
Commit
b9e1f68
·
verified ·
1 Parent(s): 1f792da

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. resid_post_layer_14/trainer_0/ae.pt +3 -0
  2. resid_post_layer_14/trainer_0/config.json +53 -0
  3. resid_post_layer_14/trainer_0/eval_results.json +1 -0
  4. resid_post_layer_14/trainer_1/config.json +53 -0
  5. resid_post_layer_14/trainer_1/eval_results.json +1 -0
  6. resid_post_layer_14/trainer_2/config.json +53 -0
  7. resid_post_layer_14/trainer_2/eval_results.json +1 -0
  8. resid_post_layer_14/trainer_3/config.json +53 -0
  9. resid_post_layer_14/trainer_3/eval_results.json +1 -0
  10. resid_post_layer_21/trainer_0/ae.pt +3 -0
  11. resid_post_layer_21/trainer_0/checkpoints/ae_0.pt +3 -0
  12. resid_post_layer_21/trainer_0/checkpoints/ae_122.pt +3 -0
  13. resid_post_layer_21/trainer_0/checkpoints/ae_1220.pt +3 -0
  14. resid_post_layer_21/trainer_0/checkpoints/ae_12207.pt +3 -0
  15. resid_post_layer_21/trainer_0/checkpoints/ae_386.pt +3 -0
  16. resid_post_layer_21/trainer_0/checkpoints/ae_3860.pt +3 -0
  17. resid_post_layer_21/trainer_0/checkpoints/ae_38601.pt +3 -0
  18. resid_post_layer_21/trainer_0/config.json +53 -0
  19. resid_post_layer_21/trainer_0/eval_results.json +1 -0
  20. resid_post_layer_21/trainer_1/ae.pt +3 -0
  21. resid_post_layer_21/trainer_1/checkpoints/ae_0.pt +3 -0
  22. resid_post_layer_21/trainer_1/checkpoints/ae_122.pt +3 -0
  23. resid_post_layer_21/trainer_1/checkpoints/ae_1220.pt +3 -0
  24. resid_post_layer_21/trainer_1/checkpoints/ae_12207.pt +3 -0
  25. resid_post_layer_21/trainer_1/checkpoints/ae_3860.pt +3 -0
  26. resid_post_layer_21/trainer_1/checkpoints/ae_38601.pt +3 -0
  27. resid_post_layer_21/trainer_1/config.json +53 -0
  28. resid_post_layer_21/trainer_1/eval_results.json +1 -0
  29. resid_post_layer_21/trainer_2/ae.pt +3 -0
  30. resid_post_layer_21/trainer_2/checkpoints/ae_386.pt +3 -0
  31. resid_post_layer_21/trainer_2/config.json +53 -0
  32. resid_post_layer_21/trainer_2/eval_results.json +1 -0
  33. resid_post_layer_21/trainer_3/ae.pt +3 -0
  34. resid_post_layer_21/trainer_3/checkpoints/ae_0.pt +3 -0
  35. resid_post_layer_21/trainer_3/checkpoints/ae_12207.pt +3 -0
  36. resid_post_layer_21/trainer_3/checkpoints/ae_386.pt +3 -0
  37. resid_post_layer_21/trainer_3/config.json +53 -0
  38. resid_post_layer_21/trainer_3/eval_results.json +1 -0
  39. resid_post_layer_7/trainer_0/ae.pt +3 -0
  40. resid_post_layer_7/trainer_0/config.json +53 -0
  41. resid_post_layer_7/trainer_0/eval_results.json +1 -0
  42. resid_post_layer_7/trainer_1/ae.pt +3 -0
  43. resid_post_layer_7/trainer_1/config.json +53 -0
  44. resid_post_layer_7/trainer_1/eval_results.json +1 -0
  45. resid_post_layer_7/trainer_2/ae.pt +3 -0
  46. resid_post_layer_7/trainer_2/config.json +53 -0
  47. resid_post_layer_7/trainer_2/eval_results.json +1 -0
  48. resid_post_layer_7/trainer_3/ae.pt +3 -0
  49. resid_post_layer_7/trainer_3/config.json +53 -0
  50. resid_post_layer_7/trainer_3/eval_results.json +1 -0
resid_post_layer_14/trainer_0/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e972e49426cc4ad5b47e819305b259747544401e9eab036c195e5925a037589
3
+ size 335622413
resid_post_layer_14/trainer_0/config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "MatryoshkaBatchTopKTrainer",
4
+ "dict_class": "MatryoshkaBatchTopKSAE",
5
+ "lr": 5e-05,
6
+ "steps": 122070,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 97656,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1280,
13
+ "seed": 0,
14
+ "activation_dim": 2560,
15
+ "dict_size": 16384,
16
+ "group_fractions": [
17
+ 0.03125,
18
+ 0.0625,
19
+ 0.125,
20
+ 0.25,
21
+ 0.53125
22
+ ],
23
+ "group_weights": [
24
+ 0.2,
25
+ 0.2,
26
+ 0.2,
27
+ 0.2,
28
+ 0.2
29
+ ],
30
+ "group_sizes": [
31
+ 512,
32
+ 1024,
33
+ 2048,
34
+ 4096,
35
+ 8704
36
+ ],
37
+ "k": 80,
38
+ "device": "cuda:0",
39
+ "layer": 14,
40
+ "lm_name": "google/gemma-4-E4B",
41
+ "wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_14_trainer_0",
42
+ "submodule_name": "resid_post_layer_14"
43
+ },
44
+ "buffer": {
45
+ "d_submodule": 2560,
46
+ "io": "out",
47
+ "n_ctxs": 122,
48
+ "ctx_len": 2048,
49
+ "refresh_batch_size": 16,
50
+ "out_batch_size": 2048,
51
+ "device": "cuda:0"
52
+ }
53
+ }
resid_post_layer_14/trainer_0/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 33.73863636363637, "l1_loss": 367.6060606060606, "l0": 86.32287019671816, "frac_variance_explained": 0.7295217803030303, "cossim": 0.94140625, "l2_ratio": 0.949514678030303, "relative_reconstruction_bias": 1.01171875, "frac_alive": 0.99676513671875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
resid_post_layer_14/trainer_1/config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "MatryoshkaBatchTopKTrainer",
4
+ "dict_class": "MatryoshkaBatchTopKSAE",
5
+ "lr": 5e-05,
6
+ "steps": 122070,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 97656,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1280,
13
+ "seed": 0,
14
+ "activation_dim": 2560,
15
+ "dict_size": 16384,
16
+ "group_fractions": [
17
+ 0.03125,
18
+ 0.0625,
19
+ 0.125,
20
+ 0.25,
21
+ 0.53125
22
+ ],
23
+ "group_weights": [
24
+ 0.2,
25
+ 0.2,
26
+ 0.2,
27
+ 0.2,
28
+ 0.2
29
+ ],
30
+ "group_sizes": [
31
+ 512,
32
+ 1024,
33
+ 2048,
34
+ 4096,
35
+ 8704
36
+ ],
37
+ "k": 160,
38
+ "device": "cuda:0",
39
+ "layer": 14,
40
+ "lm_name": "google/gemma-4-E4B",
41
+ "wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_14_trainer_1",
42
+ "submodule_name": "resid_post_layer_14"
43
+ },
44
+ "buffer": {
45
+ "d_submodule": 2560,
46
+ "io": "out",
47
+ "n_ctxs": 122,
48
+ "ctx_len": 2048,
49
+ "refresh_batch_size": 16,
50
+ "out_batch_size": 2048,
51
+ "device": "cuda:0"
52
+ }
53
+ }
resid_post_layer_14/trainer_1/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 30.748106060606062, "l1_loss": 656.2424242424242, "l0": 178.7910262599136, "frac_variance_explained": 0.775094696969697, "cossim": 0.952829071969697, "l2_ratio": 0.9609375, "relative_reconstruction_bias": 1.011600378787879, "frac_alive": 0.99554443359375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
resid_post_layer_14/trainer_2/config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "MatryoshkaBatchTopKTrainer",
4
+ "dict_class": "MatryoshkaBatchTopKSAE",
5
+ "lr": 5e-05,
6
+ "steps": 122070,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 97656,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1280,
13
+ "seed": 0,
14
+ "activation_dim": 2560,
15
+ "dict_size": 65536,
16
+ "group_fractions": [
17
+ 0.03125,
18
+ 0.0625,
19
+ 0.125,
20
+ 0.25,
21
+ 0.53125
22
+ ],
23
+ "group_weights": [
24
+ 0.2,
25
+ 0.2,
26
+ 0.2,
27
+ 0.2,
28
+ 0.2
29
+ ],
30
+ "group_sizes": [
31
+ 2048,
32
+ 4096,
33
+ 8192,
34
+ 16384,
35
+ 34816
36
+ ],
37
+ "k": 80,
38
+ "device": "cuda:0",
39
+ "layer": 14,
40
+ "lm_name": "google/gemma-4-E4B",
41
+ "wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_14_trainer_2",
42
+ "submodule_name": "resid_post_layer_14"
43
+ },
44
+ "buffer": {
45
+ "d_submodule": 2560,
46
+ "io": "out",
47
+ "n_ctxs": 122,
48
+ "ctx_len": 2048,
49
+ "refresh_batch_size": 16,
50
+ "out_batch_size": 2048,
51
+ "device": "cuda:0"
52
+ }
53
+ }
resid_post_layer_14/trainer_2/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 31.926136363636363, "l1_loss": 346.90909090909093, "l0": 87.18260990489613, "frac_variance_explained": 0.7577533143939394, "cossim": 0.9484493371212122, "l2_ratio": 0.9568536931818182, "relative_reconstruction_bias": 1.0106534090909092, "frac_alive": 0.83673095703125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
resid_post_layer_14/trainer_3/config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "MatryoshkaBatchTopKTrainer",
4
+ "dict_class": "MatryoshkaBatchTopKSAE",
5
+ "lr": 5e-05,
6
+ "steps": 122070,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 97656,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1280,
13
+ "seed": 0,
14
+ "activation_dim": 2560,
15
+ "dict_size": 65536,
16
+ "group_fractions": [
17
+ 0.03125,
18
+ 0.0625,
19
+ 0.125,
20
+ 0.25,
21
+ 0.53125
22
+ ],
23
+ "group_weights": [
24
+ 0.2,
25
+ 0.2,
26
+ 0.2,
27
+ 0.2,
28
+ 0.2
29
+ ],
30
+ "group_sizes": [
31
+ 2048,
32
+ 4096,
33
+ 8192,
34
+ 16384,
35
+ 34816
36
+ ],
37
+ "k": 160,
38
+ "device": "cuda:0",
39
+ "layer": 14,
40
+ "lm_name": "google/gemma-4-E4B",
41
+ "wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_14_trainer_3",
42
+ "submodule_name": "resid_post_layer_14"
43
+ },
44
+ "buffer": {
45
+ "d_submodule": 2560,
46
+ "io": "out",
47
+ "n_ctxs": 122,
48
+ "ctx_len": 2048,
49
+ "refresh_batch_size": 16,
50
+ "out_batch_size": 2048,
51
+ "device": "cuda:0"
52
+ }
53
+ }
resid_post_layer_14/trainer_3/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 28.75, "l1_loss": 598.8484848484849, "l0": 179.98043915719697, "frac_variance_explained": 0.8037405303030303, "cossim": 0.95703125, "l2_ratio": 0.96484375, "relative_reconstruction_bias": 1.0095880681818181, "frac_alive": 0.770263671875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
resid_post_layer_21/trainer_0/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:165e90ed2f6253da543c03a24825726264d22e8ad92176ec4ad4dbdb3945270a
3
+ size 335622413
resid_post_layer_21/trainer_0/checkpoints/ae_0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5c244af1bb6fdc5168d5a0fba14e47f0d7966ef4d7cec1675ee697750f66f27
3
+ size 335622563
resid_post_layer_21/trainer_0/checkpoints/ae_122.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:945e387769f7beda9be7b0c9cdd696b2e2abefecece8392a1b4c2e80084442bf
3
+ size 335622585
resid_post_layer_21/trainer_0/checkpoints/ae_1220.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39e6bc706f6b8e964840095580947f17a6bcede000021a3c33cf2aa3cee1936e
3
+ size 335622596
resid_post_layer_21/trainer_0/checkpoints/ae_12207.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5c625417e79276c56a1f143346a90a28a0435662bcb997db3b56bebbc398265
3
+ size 335622863
resid_post_layer_21/trainer_0/checkpoints/ae_386.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7a2b19e91d0bd6e2289554404ee0fb86d26fc506de8b4e8a2ec3a3e7d79c9f1
3
+ size 335622585
resid_post_layer_21/trainer_0/checkpoints/ae_3860.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:391cd1f61d02cec575cfbdef45413d669a62933edec176af68033e87692dd893
3
+ size 335622596
resid_post_layer_21/trainer_0/checkpoints/ae_38601.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8b619b9fd567aedb2aa09fb46afcf1a641e6220be5d2deb316ebe807ddf1dfd
3
+ size 335622863
resid_post_layer_21/trainer_0/config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "MatryoshkaBatchTopKTrainer",
4
+ "dict_class": "MatryoshkaBatchTopKSAE",
5
+ "lr": 5e-05,
6
+ "steps": 122070,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 97656,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1280,
13
+ "seed": 0,
14
+ "activation_dim": 2560,
15
+ "dict_size": 16384,
16
+ "group_fractions": [
17
+ 0.03125,
18
+ 0.0625,
19
+ 0.125,
20
+ 0.25,
21
+ 0.53125
22
+ ],
23
+ "group_weights": [
24
+ 0.2,
25
+ 0.2,
26
+ 0.2,
27
+ 0.2,
28
+ 0.2
29
+ ],
30
+ "group_sizes": [
31
+ 512,
32
+ 1024,
33
+ 2048,
34
+ 4096,
35
+ 8704
36
+ ],
37
+ "k": 80,
38
+ "device": "cuda:0",
39
+ "layer": 21,
40
+ "lm_name": "google/gemma-4-E4B",
41
+ "wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_21_trainer_0",
42
+ "submodule_name": "resid_post_layer_21"
43
+ },
44
+ "buffer": {
45
+ "d_submodule": 2560,
46
+ "io": "out",
47
+ "n_ctxs": 122,
48
+ "ctx_len": 2048,
49
+ "refresh_batch_size": 16,
50
+ "out_batch_size": 2048,
51
+ "device": "cuda:0"
52
+ }
53
+ }
resid_post_layer_21/trainer_0/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 29.035984848484848, "l1_loss": 389.1212121212121, "l0": 86.1897815357555, "frac_variance_explained": 0.8267045454545454, "cossim": 0.9356060606060606, "l2_ratio": 0.9453125, "relative_reconstruction_bias": 1.0080492424242424, "frac_alive": 0.8565673828125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
resid_post_layer_21/trainer_1/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10a23c9b3a00ddd46fb78590c3d00ea4c61ef652f192a73f741b9c590251d5b1
3
+ size 335622413
resid_post_layer_21/trainer_1/checkpoints/ae_0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:691f9ddf883175ebdcf92281a5788e1d56f11403bfd15152a0d8fa77ecf2ab1c
3
+ size 335622563
resid_post_layer_21/trainer_1/checkpoints/ae_122.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:249fb4fd697ed16d6fc7899de500c9ffedc26d111999ec7cb55a1bd4f7a6967c
3
+ size 335622585
resid_post_layer_21/trainer_1/checkpoints/ae_1220.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c82d644ef2719a67149ff1a159f8ad31044dcbcd85b7f6cb541e56bfc5d2d2b8
3
+ size 335622596
resid_post_layer_21/trainer_1/checkpoints/ae_12207.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3c433a364c0e7c8971a486bd3128c869bb4b2195dbb95fea4d536b0a73dc324
3
+ size 335622863
resid_post_layer_21/trainer_1/checkpoints/ae_3860.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1120734e71d54fb0babac9283086f3f8b898adfb9944a023fb75017d453b4190
3
+ size 335622596
resid_post_layer_21/trainer_1/checkpoints/ae_38601.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3b816102b2ae8f3d93b3f98fb1bf0f5c40c9b80c838df83c1c0df35492b2ebc
3
+ size 335622863
resid_post_layer_21/trainer_1/config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "MatryoshkaBatchTopKTrainer",
4
+ "dict_class": "MatryoshkaBatchTopKSAE",
5
+ "lr": 5e-05,
6
+ "steps": 122070,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 97656,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1280,
13
+ "seed": 0,
14
+ "activation_dim": 2560,
15
+ "dict_size": 16384,
16
+ "group_fractions": [
17
+ 0.03125,
18
+ 0.0625,
19
+ 0.125,
20
+ 0.25,
21
+ 0.53125
22
+ ],
23
+ "group_weights": [
24
+ 0.2,
25
+ 0.2,
26
+ 0.2,
27
+ 0.2,
28
+ 0.2
29
+ ],
30
+ "group_sizes": [
31
+ 512,
32
+ 1024,
33
+ 2048,
34
+ 4096,
35
+ 8704
36
+ ],
37
+ "k": 160,
38
+ "device": "cuda:0",
39
+ "layer": 21,
40
+ "lm_name": "google/gemma-4-E4B",
41
+ "wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_21_trainer_1",
42
+ "submodule_name": "resid_post_layer_21"
43
+ },
44
+ "buffer": {
45
+ "d_submodule": 2560,
46
+ "io": "out",
47
+ "n_ctxs": 122,
48
+ "ctx_len": 2048,
49
+ "refresh_batch_size": 16,
50
+ "out_batch_size": 2048,
51
+ "device": "cuda:0"
52
+ }
53
+ }
resid_post_layer_21/trainer_1/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 25.890151515151516, "l1_loss": 713.7575757575758, "l0": 172.76931184710878, "frac_variance_explained": 0.8618016098484849, "cossim": 0.94921875, "l2_ratio": 0.95703125, "relative_reconstruction_bias": 1.0078125, "frac_alive": 0.82403564453125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
resid_post_layer_21/trainer_2/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4dd3747bb0722cf29b80d44490af87fcebe5b400c9d4e855a9c67eb3b4bf9888
3
+ size 1342451981
resid_post_layer_21/trainer_2/checkpoints/ae_386.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93c79e12e7bc1bccdb7033bc45f1e92f6f3f607c86ac462002106c8318347b7f
3
+ size 1342452153
resid_post_layer_21/trainer_2/config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "MatryoshkaBatchTopKTrainer",
4
+ "dict_class": "MatryoshkaBatchTopKSAE",
5
+ "lr": 5e-05,
6
+ "steps": 122070,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 97656,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1280,
13
+ "seed": 0,
14
+ "activation_dim": 2560,
15
+ "dict_size": 65536,
16
+ "group_fractions": [
17
+ 0.03125,
18
+ 0.0625,
19
+ 0.125,
20
+ 0.25,
21
+ 0.53125
22
+ ],
23
+ "group_weights": [
24
+ 0.2,
25
+ 0.2,
26
+ 0.2,
27
+ 0.2,
28
+ 0.2
29
+ ],
30
+ "group_sizes": [
31
+ 2048,
32
+ 4096,
33
+ 8192,
34
+ 16384,
35
+ 34816
36
+ ],
37
+ "k": 80,
38
+ "device": "cuda:0",
39
+ "layer": 21,
40
+ "lm_name": "google/gemma-4-E4B",
41
+ "wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_21_trainer_2",
42
+ "submodule_name": "resid_post_layer_21"
43
+ },
44
+ "buffer": {
45
+ "d_submodule": 2560,
46
+ "io": "out",
47
+ "n_ctxs": 122,
48
+ "ctx_len": 2048,
49
+ "refresh_batch_size": 16,
50
+ "out_batch_size": 2048,
51
+ "device": "cuda:0"
52
+ }
53
+ }
resid_post_layer_21/trainer_2/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 27.90340909090909, "l1_loss": 354.72727272727275, "l0": 86.3529703544848, "frac_variance_explained": 0.8391335227272727, "cossim": 0.94140625, "l2_ratio": 0.9491595643939394, "relative_reconstruction_bias": 1.0078125, "frac_alive": 0.7160797119140625, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
resid_post_layer_21/trainer_3/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0823518391306a0c4b7854174b77165f825a8c4bc0244b554a4b56df0395dd06
3
+ size 1342451981
resid_post_layer_21/trainer_3/checkpoints/ae_0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:125f43f916a01a1c9fd3ae8bd241c185076396658210163cfa52396424584ccd
3
+ size 1342452131
resid_post_layer_21/trainer_3/checkpoints/ae_12207.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94eadcff4ef5be95da7aae55a5c422da6108b63161e37bd828159e2c12b6cc3c
3
+ size 1342452431
resid_post_layer_21/trainer_3/checkpoints/ae_386.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fad19132d5b66d0c5359ee4a5bcf115726d4a4775034c346c428a8b03dc2a074
3
+ size 1342452153
resid_post_layer_21/trainer_3/config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "MatryoshkaBatchTopKTrainer",
4
+ "dict_class": "MatryoshkaBatchTopKSAE",
5
+ "lr": 5e-05,
6
+ "steps": 122070,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 97656,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1280,
13
+ "seed": 0,
14
+ "activation_dim": 2560,
15
+ "dict_size": 65536,
16
+ "group_fractions": [
17
+ 0.03125,
18
+ 0.0625,
19
+ 0.125,
20
+ 0.25,
21
+ 0.53125
22
+ ],
23
+ "group_weights": [
24
+ 0.2,
25
+ 0.2,
26
+ 0.2,
27
+ 0.2,
28
+ 0.2
29
+ ],
30
+ "group_sizes": [
31
+ 2048,
32
+ 4096,
33
+ 8192,
34
+ 16384,
35
+ 34816
36
+ ],
37
+ "k": 160,
38
+ "device": "cuda:0",
39
+ "layer": 21,
40
+ "lm_name": "google/gemma-4-E4B",
41
+ "wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_21_trainer_3",
42
+ "submodule_name": "resid_post_layer_21"
43
+ },
44
+ "buffer": {
45
+ "d_submodule": 2560,
46
+ "io": "out",
47
+ "n_ctxs": 122,
48
+ "ctx_len": 2048,
49
+ "refresh_batch_size": 16,
50
+ "out_batch_size": 2048,
51
+ "device": "cuda:0"
52
+ }
53
+ }
resid_post_layer_21/trainer_3/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 24.477272727272727, "l1_loss": 605.030303030303, "l0": 172.87360659512606, "frac_variance_explained": 0.8756510416666666, "cossim": 0.9533617424242424, "l2_ratio": 0.9609375, "relative_reconstruction_bias": 1.0078125, "frac_alive": 0.713165283203125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
resid_post_layer_7/trainer_0/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33d9a3668cb511c09f22665184e6558d6e8504501d3f4f12b2e96486a51cac5f
3
+ size 335622413
resid_post_layer_7/trainer_0/config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "MatryoshkaBatchTopKTrainer",
4
+ "dict_class": "MatryoshkaBatchTopKSAE",
5
+ "lr": 5e-05,
6
+ "steps": 122070,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 97656,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1280,
13
+ "seed": 0,
14
+ "activation_dim": 2560,
15
+ "dict_size": 16384,
16
+ "group_fractions": [
17
+ 0.03125,
18
+ 0.0625,
19
+ 0.125,
20
+ 0.25,
21
+ 0.53125
22
+ ],
23
+ "group_weights": [
24
+ 0.2,
25
+ 0.2,
26
+ 0.2,
27
+ 0.2,
28
+ 0.2
29
+ ],
30
+ "group_sizes": [
31
+ 512,
32
+ 1024,
33
+ 2048,
34
+ 4096,
35
+ 8704
36
+ ],
37
+ "k": 80,
38
+ "device": "cuda:0",
39
+ "layer": 7,
40
+ "lm_name": "google/gemma-4-E4B",
41
+ "wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_7_trainer_0",
42
+ "submodule_name": "resid_post_layer_7"
43
+ },
44
+ "buffer": {
45
+ "d_submodule": 2560,
46
+ "io": "out",
47
+ "n_ctxs": 122,
48
+ "ctx_len": 2048,
49
+ "refresh_batch_size": 16,
50
+ "out_batch_size": 2048,
51
+ "device": "cuda:0"
52
+ }
53
+ }
resid_post_layer_7/trainer_0/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 24.009469696969695, "l1_loss": 307.54545454545456, "l0": 92.451904296875, "frac_variance_explained": 0.7856889204545454, "cossim": 0.9562618371212122, "l2_ratio": 0.96484375, "relative_reconstruction_bias": 1.007930871212121, "frac_alive": 0.98370361328125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
resid_post_layer_7/trainer_1/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21984cbf97693658675bddcdaec65426aa20d49bd7926914339f13f5e53d2827
3
+ size 335622413
resid_post_layer_7/trainer_1/config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "MatryoshkaBatchTopKTrainer",
4
+ "dict_class": "MatryoshkaBatchTopKSAE",
5
+ "lr": 5e-05,
6
+ "steps": 122070,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 97656,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1280,
13
+ "seed": 0,
14
+ "activation_dim": 2560,
15
+ "dict_size": 16384,
16
+ "group_fractions": [
17
+ 0.03125,
18
+ 0.0625,
19
+ 0.125,
20
+ 0.25,
21
+ 0.53125
22
+ ],
23
+ "group_weights": [
24
+ 0.2,
25
+ 0.2,
26
+ 0.2,
27
+ 0.2,
28
+ 0.2
29
+ ],
30
+ "group_sizes": [
31
+ 512,
32
+ 1024,
33
+ 2048,
34
+ 4096,
35
+ 8704
36
+ ],
37
+ "k": 160,
38
+ "device": "cuda:0",
39
+ "layer": 7,
40
+ "lm_name": "google/gemma-4-E4B",
41
+ "wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_7_trainer_1",
42
+ "submodule_name": "resid_post_layer_7"
43
+ },
44
+ "buffer": {
45
+ "d_submodule": 2560,
46
+ "io": "out",
47
+ "n_ctxs": 122,
48
+ "ctx_len": 2048,
49
+ "refresh_batch_size": 16,
50
+ "out_batch_size": 2048,
51
+ "device": "cuda:0"
52
+ }
53
+ }
resid_post_layer_7/trainer_1/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 21.954545454545453, "l1_loss": 509.8181818181818, "l0": 185.27754905007103, "frac_variance_explained": 0.8208451704545454, "cossim": 0.9637784090909091, "l2_ratio": 0.97265625, "relative_reconstruction_bias": 1.0078125, "frac_alive": 0.9376220703125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
resid_post_layer_7/trainer_2/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89b5ea3fe385ce68e9e80924aa94b1f17f248bf0f5aa76e58c491fdd5d82a289
3
+ size 1342451981
resid_post_layer_7/trainer_2/config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "MatryoshkaBatchTopKTrainer",
4
+ "dict_class": "MatryoshkaBatchTopKSAE",
5
+ "lr": 5e-05,
6
+ "steps": 122070,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 97656,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1280,
13
+ "seed": 0,
14
+ "activation_dim": 2560,
15
+ "dict_size": 65536,
16
+ "group_fractions": [
17
+ 0.03125,
18
+ 0.0625,
19
+ 0.125,
20
+ 0.25,
21
+ 0.53125
22
+ ],
23
+ "group_weights": [
24
+ 0.2,
25
+ 0.2,
26
+ 0.2,
27
+ 0.2,
28
+ 0.2
29
+ ],
30
+ "group_sizes": [
31
+ 2048,
32
+ 4096,
33
+ 8192,
34
+ 16384,
35
+ 34816
36
+ ],
37
+ "k": 80,
38
+ "device": "cuda:0",
39
+ "layer": 7,
40
+ "lm_name": "google/gemma-4-E4B",
41
+ "wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_7_trainer_2",
42
+ "submodule_name": "resid_post_layer_7"
43
+ },
44
+ "buffer": {
45
+ "d_submodule": 2560,
46
+ "io": "out",
47
+ "n_ctxs": 122,
48
+ "ctx_len": 2048,
49
+ "refresh_batch_size": 16,
50
+ "out_batch_size": 2048,
51
+ "device": "cuda:0"
52
+ }
53
+ }
resid_post_layer_7/trainer_2/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 22.15530303030303, "l1_loss": 281.45454545454544, "l0": 93.40012440536961, "frac_variance_explained": 0.8163470643939394, "cossim": 0.9609375, "l2_ratio": 0.96875, "relative_reconstruction_bias": 1.0078125, "frac_alive": 0.830413818359375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
resid_post_layer_7/trainer_3/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd7c4889199febcdc335303bd392d5e416345715f6535f462bbd85db7ef69366
3
+ size 1342451981
resid_post_layer_7/trainer_3/config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "MatryoshkaBatchTopKTrainer",
4
+ "dict_class": "MatryoshkaBatchTopKSAE",
5
+ "lr": 5e-05,
6
+ "steps": 122070,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 97656,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1280,
13
+ "seed": 0,
14
+ "activation_dim": 2560,
15
+ "dict_size": 65536,
16
+ "group_fractions": [
17
+ 0.03125,
18
+ 0.0625,
19
+ 0.125,
20
+ 0.25,
21
+ 0.53125
22
+ ],
23
+ "group_weights": [
24
+ 0.2,
25
+ 0.2,
26
+ 0.2,
27
+ 0.2,
28
+ 0.2
29
+ ],
30
+ "group_sizes": [
31
+ 2048,
32
+ 4096,
33
+ 8192,
34
+ 16384,
35
+ 34816
36
+ ],
37
+ "k": 160,
38
+ "device": "cuda:0",
39
+ "layer": 7,
40
+ "lm_name": "google/gemma-4-E4B",
41
+ "wandb_name": "MatryoshkaBatchTopKTrainer-google/gemma-4-E4B-resid_post_layer_7_trainer_3",
42
+ "submodule_name": "resid_post_layer_7"
43
+ },
44
+ "buffer": {
45
+ "d_submodule": 2560,
46
+ "io": "out",
47
+ "n_ctxs": 122,
48
+ "ctx_len": 2048,
49
+ "refresh_batch_size": 16,
50
+ "out_batch_size": 2048,
51
+ "device": "cuda:0"
52
+ }
53
+ }
resid_post_layer_7/trainer_3/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 19.914772727272727, "l1_loss": 427.3939393939394, "l0": 188.28258190733013, "frac_variance_explained": 0.8515625, "cossim": 0.96875, "l2_ratio": 0.9765625, "relative_reconstruction_bias": 1.0078125, "frac_alive": 0.8144989013671875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}