AwesomeInterpretability committed on
Commit
f72c6b2
·
verified ·
1 Parent(s): ebdaa38

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_0/ae.pt +3 -0
  2. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_0/config.json +29 -0
  3. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_0/eval_results.json +1 -0
  4. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_1/ae.pt +3 -0
  5. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_1/config.json +29 -0
  6. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_1/eval_results.json +1 -0
  7. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_10/ae.pt +3 -0
  8. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_10/config.json +31 -0
  9. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_10/eval_results.json +1 -0
  10. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_11/ae.pt +3 -0
  11. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_11/config.json +31 -0
  12. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_11/eval_results.json +1 -0
  13. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_12/ae.pt +3 -0
  14. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_12/config.json +32 -0
  15. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_12/eval_results.json +1 -0
  16. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_13/ae.pt +3 -0
  17. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_13/config.json +32 -0
  18. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_13/eval_results.json +1 -0
  19. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_14/ae.pt +3 -0
  20. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_14/config.json +32 -0
  21. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_14/eval_results.json +1 -0
  22. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_15/ae.pt +3 -0
  23. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_15/config.json +32 -0
  24. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_15/eval_results.json +1 -0
  25. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_16/ae.pt +3 -0
  26. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_16/config.json +32 -0
  27. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_16/eval_results.json +1 -0
  28. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_17/ae.pt +3 -0
  29. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_17/config.json +32 -0
  30. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_17/eval_results.json +1 -0
  31. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_2/ae.pt +3 -0
  32. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_2/config.json +29 -0
  33. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_2/eval_results.json +1 -0
  34. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_3/ae.pt +3 -0
  35. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_3/config.json +29 -0
  36. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_3/eval_results.json +1 -0
  37. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_4/ae.pt +3 -0
  38. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_4/config.json +29 -0
  39. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_4/eval_results.json +1 -0
  40. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_5/ae.pt +3 -0
  41. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_5/config.json +29 -0
  42. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_5/eval_results.json +1 -0
  43. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_6/ae.pt +3 -0
  44. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_6/config.json +31 -0
  45. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_6/eval_results.json +1 -0
  46. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_7/ae.pt +3 -0
  47. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_7/config.json +31 -0
  48. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_7/eval_results.json +1 -0
  49. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_8/ae.pt +3 -0
  50. saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_8/config.json +31 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_0/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a1d8f2dec81cf9df336c0067b2c8ac2b1a781aaa7dc7b3c2c252a6c58aabb39
3
+ size 268510888
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_0/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainerAprilUpdate",
5
+ "activation_dim": 2048,
6
+ "dict_size": 16384,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.012,
9
+ "warmup_steps": 1000,
10
+ "sparsity_warmup_steps": 5000,
11
+ "steps": 244140,
12
+ "decay_start": 195312,
13
+ "seed": 3407,
14
+ "device": "cuda:0",
15
+ "layer": 17,
16
+ "lm_name": "Qwen/Qwen2.5-3B",
17
+ "wandb_name": "StandardTrainerNew-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_0",
18
+ "submodule_name": "resid_post_layer_17"
19
+ },
20
+ "buffer": {
21
+ "d_submodule": 2048,
22
+ "io": "out",
23
+ "n_ctxs": 122,
24
+ "ctx_len": 2048,
25
+ "refresh_batch_size": 4,
26
+ "out_batch_size": 2048,
27
+ "device": "cuda:0"
28
+ }
29
+ }
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_0/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 30.879375, "l1_loss": 661.24, "l0": 1127.2820458984374, "frac_variance_explained": -0.4024609375, "cossim": 0.94140625, "l2_ratio": 0.92974609375, "relative_reconstruction_bias": 2.2902734375, "frac_alive": 0.856201171875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_1/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11f5775c39495e593118cf42fe9aceaf000b7fa718914aed1924ecfe7ceb82b2
3
+ size 268510888
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_1/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainerAprilUpdate",
5
+ "activation_dim": 2048,
6
+ "dict_size": 16384,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.015,
9
+ "warmup_steps": 1000,
10
+ "sparsity_warmup_steps": 5000,
11
+ "steps": 244140,
12
+ "decay_start": 195312,
13
+ "seed": 3407,
14
+ "device": "cuda:0",
15
+ "layer": 17,
16
+ "lm_name": "Qwen/Qwen2.5-3B",
17
+ "wandb_name": "StandardTrainerNew-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_1",
18
+ "submodule_name": "resid_post_layer_17"
19
+ },
20
+ "buffer": {
21
+ "d_submodule": 2048,
22
+ "io": "out",
23
+ "n_ctxs": 122,
24
+ "ctx_len": 2048,
25
+ "refresh_batch_size": 4,
26
+ "out_batch_size": 2048,
27
+ "device": "cuda:0"
28
+ }
29
+ }
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_1/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 35.52625, "l1_loss": 595.71, "l0": 808.0417797851562, "frac_variance_explained": -1.18908203125, "cossim": 0.92630859375, "l2_ratio": 0.91580078125, "relative_reconstruction_bias": 3.07451171875, "frac_alive": 0.84942626953125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_10/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9eba2b75a1a740969391657b3a5833639843ecd4afc78fb94a893c75193e364
3
+ size 268511254
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_10/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "TopKTrainer",
4
+ "dict_class": "AutoEncoderTopK",
5
+ "lr": 0.0003,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "seed": 3407,
13
+ "activation_dim": 2048,
14
+ "dict_size": 16384,
15
+ "k": 520,
16
+ "device": "cuda:0",
17
+ "layer": 17,
18
+ "lm_name": "Qwen/Qwen2.5-3B",
19
+ "wandb_name": "TopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_10",
20
+ "submodule_name": "resid_post_layer_17"
21
+ },
22
+ "buffer": {
23
+ "d_submodule": 2048,
24
+ "io": "out",
25
+ "n_ctxs": 122,
26
+ "ctx_len": 2048,
27
+ "refresh_batch_size": 4,
28
+ "out_batch_size": 2048,
29
+ "device": "cuda:0"
30
+ }
31
+ }
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_10/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 20.95625, "l1_loss": 1669.56, "l0": 520.0, "frac_variance_explained": 0.94826171875, "cossim": 0.95701171875, "l2_ratio": 0.95572265625, "relative_reconstruction_bias": 1.0398046875, "frac_alive": 0.76019287109375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_11/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e989ba017d39a5ec42c1f59c08cbb4d0be38963a1f4b428fc3cd69ddb614b07
3
+ size 268511254
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_11/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "TopKTrainer",
4
+ "dict_class": "AutoEncoderTopK",
5
+ "lr": 0.0003,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "seed": 3407,
13
+ "activation_dim": 2048,
14
+ "dict_size": 16384,
15
+ "k": 820,
16
+ "device": "cuda:0",
17
+ "layer": 17,
18
+ "lm_name": "Qwen/Qwen2.5-3B",
19
+ "wandb_name": "TopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_11",
20
+ "submodule_name": "resid_post_layer_17"
21
+ },
22
+ "buffer": {
23
+ "d_submodule": 2048,
24
+ "io": "out",
25
+ "n_ctxs": 122,
26
+ "ctx_len": 2048,
27
+ "refresh_batch_size": 4,
28
+ "out_batch_size": 2048,
29
+ "device": "cuda:0"
30
+ }
31
+ }
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_11/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 15.7865625, "l1_loss": 2385.28, "l0": 820.0, "frac_variance_explained": 0.96828125, "cossim": 0.9765625, "l2_ratio": 0.97267578125, "relative_reconstruction_bias": 1.0452734375, "frac_alive": 0.33575439453125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_12/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:694b546c3509ddf883451a3505dc43eed933d17fd2014b184814b191ae590f2f
3
+ size 268511254
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_12/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1024,
13
+ "seed": 3407,
14
+ "activation_dim": 2048,
15
+ "dict_size": 16384,
16
+ "k": 50,
17
+ "device": "cuda:0",
18
+ "layer": 17,
19
+ "lm_name": "Qwen/Qwen2.5-3B",
20
+ "wandb_name": "BatchTopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_12",
21
+ "submodule_name": "resid_post_layer_17"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 2048,
25
+ "io": "out",
26
+ "n_ctxs": 122,
27
+ "ctx_len": 2048,
28
+ "refresh_batch_size": 4,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda:0"
31
+ }
32
+ }
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_12/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 46.85375, "l1_loss": 527.98, "l0": 88.1703369140625, "frac_variance_explained": -3.31318359375, "cossim": 0.87599609375, "l2_ratio": 0.89609375, "relative_reconstruction_bias": 4.184140625, "frac_alive": 0.9842529296875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_13/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:066d0e4ee037b48e5a48ed8990e55ce6ae9baf0bda5d425c1c061395e560e524
3
+ size 268511254
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_13/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1024,
13
+ "seed": 3407,
14
+ "activation_dim": 2048,
15
+ "dict_size": 16384,
16
+ "k": 80,
17
+ "device": "cuda:0",
18
+ "layer": 17,
19
+ "lm_name": "Qwen/Qwen2.5-3B",
20
+ "wandb_name": "BatchTopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_13",
21
+ "submodule_name": "resid_post_layer_17"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 2048,
25
+ "io": "out",
26
+ "n_ctxs": 122,
27
+ "ctx_len": 2048,
28
+ "refresh_batch_size": 4,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda:0"
31
+ }
32
+ }
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_13/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 40.933125, "l1_loss": 601.47, "l0": 139.47208984375, "frac_variance_explained": -1.42328125, "cossim": 0.89396484375, "l2_ratio": 0.9131640625, "relative_reconstruction_bias": 2.918828125, "frac_alive": 0.9884033203125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_14/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae70d85a1b4695c2b085c534e7dc258ae29a8fb08e5d18d34aa7707d80b1c6f1
3
+ size 268511254
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_14/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1024,
13
+ "seed": 3407,
14
+ "activation_dim": 2048,
15
+ "dict_size": 16384,
16
+ "k": 160,
17
+ "device": "cuda:0",
18
+ "layer": 17,
19
+ "lm_name": "Qwen/Qwen2.5-3B",
20
+ "wandb_name": "BatchTopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_14",
21
+ "submodule_name": "resid_post_layer_17"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 2048,
25
+ "io": "out",
26
+ "n_ctxs": 122,
27
+ "ctx_len": 2048,
28
+ "refresh_batch_size": 4,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda:0"
31
+ }
32
+ }
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_14/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 34.115, "l1_loss": 754.1, "l0": 256.5938452148437, "frac_variance_explained": -0.41078125, "cossim": 0.92021484375, "l2_ratio": 0.93349609375, "relative_reconstruction_bias": 2.233203125, "frac_alive": 0.9779052734375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_15/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c539372f8ea9a792322995795b1ce3773c4bae4fbaa65f142c5be8f083f409d
3
+ size 268511254
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_15/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1024,
13
+ "seed": 3407,
14
+ "activation_dim": 2048,
15
+ "dict_size": 16384,
16
+ "k": 320,
17
+ "device": "cuda:0",
18
+ "layer": 17,
19
+ "lm_name": "Qwen/Qwen2.5-3B",
20
+ "wandb_name": "BatchTopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_15",
21
+ "submodule_name": "resid_post_layer_17"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 2048,
25
+ "io": "out",
26
+ "n_ctxs": 122,
27
+ "ctx_len": 2048,
28
+ "refresh_batch_size": 4,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda:0"
31
+ }
32
+ }
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_15/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 26.280625, "l1_loss": 1028.06, "l0": 452.7045849609375, "frac_variance_explained": 0.453125, "cossim": 0.94921875, "l2_ratio": 0.9529296875, "relative_reconstruction_bias": 1.596015625, "frac_alive": 0.79998779296875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_16/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee2a19f8efa063e40c3653e33345d62451b3ad047f3e4585bf07ba0db491ee68
3
+ size 268511254
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_16/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1024,
13
+ "seed": 3407,
14
+ "activation_dim": 2048,
15
+ "dict_size": 16384,
16
+ "k": 520,
17
+ "device": "cuda:0",
18
+ "layer": 17,
19
+ "lm_name": "Qwen/Qwen2.5-3B",
20
+ "wandb_name": "BatchTopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_16",
21
+ "submodule_name": "resid_post_layer_17"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 2048,
25
+ "io": "out",
26
+ "n_ctxs": 122,
27
+ "ctx_len": 2048,
28
+ "refresh_batch_size": 4,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda:0"
31
+ }
32
+ }
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_16/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 21.13875, "l1_loss": 1909.0, "l0": 685.9320922851563, "frac_variance_explained": 0.791875, "cossim": 0.961171875, "l2_ratio": 0.9586328125, "relative_reconstruction_bias": 1.278515625, "frac_alive": 0.527099609375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_17/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd0757a3307159a6ab182d22337f21511e5888b585a7f4df61ca912a4f14af32
3
+ size 268511254
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_17/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0003,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1024,
13
+ "seed": 3407,
14
+ "activation_dim": 2048,
15
+ "dict_size": 16384,
16
+ "k": 820,
17
+ "device": "cuda:0",
18
+ "layer": 17,
19
+ "lm_name": "Qwen/Qwen2.5-3B",
20
+ "wandb_name": "BatchTopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_17",
21
+ "submodule_name": "resid_post_layer_17"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 2048,
25
+ "io": "out",
26
+ "n_ctxs": 122,
27
+ "ctx_len": 2048,
28
+ "refresh_batch_size": 4,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda:0"
31
+ }
32
+ }
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_17/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 15.2528125, "l1_loss": 2843.04, "l0": 1031.6360986328125, "frac_variance_explained": 0.9468359375, "cossim": 0.9765625, "l2_ratio": 0.968359375, "relative_reconstruction_bias": 1.071953125, "frac_alive": 0.301025390625, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_2/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9da5396e68d059eea837520a335f7ad87438c4012a2c12281f5f062f589e6cce
3
+ size 268510888
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_2/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainerAprilUpdate",
5
+ "activation_dim": 2048,
6
+ "dict_size": 16384,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.02,
9
+ "warmup_steps": 1000,
10
+ "sparsity_warmup_steps": 5000,
11
+ "steps": 244140,
12
+ "decay_start": 195312,
13
+ "seed": 3407,
14
+ "device": "cuda:0",
15
+ "layer": 17,
16
+ "lm_name": "Qwen/Qwen2.5-3B",
17
+ "wandb_name": "StandardTrainerNew-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_2",
18
+ "submodule_name": "resid_post_layer_17"
19
+ },
20
+ "buffer": {
21
+ "d_submodule": 2048,
22
+ "io": "out",
23
+ "n_ctxs": 122,
24
+ "ctx_len": 2048,
25
+ "refresh_batch_size": 4,
26
+ "out_batch_size": 2048,
27
+ "device": "cuda:0"
28
+ }
29
+ }
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_2/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 41.816875, "l1_loss": 535.65, "l0": 496.90281494140623, "frac_variance_explained": -2.56572265625, "cossim": 0.90685546875, "l2_ratio": 0.89265625, "relative_reconstruction_bias": 4.47685546875, "frac_alive": 0.84442138671875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_3/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c436edbd3b036ae240edece29e375e791805c7ef1ab7c7493006d8b4df11031c
3
+ size 268510888
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_3/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainerAprilUpdate",
5
+ "activation_dim": 2048,
6
+ "dict_size": 16384,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.03,
9
+ "warmup_steps": 1000,
10
+ "sparsity_warmup_steps": 5000,
11
+ "steps": 244140,
12
+ "decay_start": 195312,
13
+ "seed": 3407,
14
+ "device": "cuda:0",
15
+ "layer": 17,
16
+ "lm_name": "Qwen/Qwen2.5-3B",
17
+ "wandb_name": "StandardTrainerNew-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_3",
18
+ "submodule_name": "resid_post_layer_17"
19
+ },
20
+ "buffer": {
21
+ "d_submodule": 2048,
22
+ "io": "out",
23
+ "n_ctxs": 122,
24
+ "ctx_len": 2048,
25
+ "refresh_batch_size": 4,
26
+ "out_batch_size": 2048,
27
+ "device": "cuda:0"
28
+ }
29
+ }
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_3/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 48.4025, "l1_loss": 464.59, "l0": 241.1419287109375, "frac_variance_explained": -4.48306640625, "cossim": 0.8783203125, "l2_ratio": 0.85603515625, "relative_reconstruction_bias": 6.0671484375, "frac_alive": 0.8358154296875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_4/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bc90a59829f18b3a66df8130f83d72273199c4ce7c1d6c60e0cf6dd55dae725
3
+ size 268510888
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_4/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainerAprilUpdate",
5
+ "activation_dim": 2048,
6
+ "dict_size": 16384,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.04,
9
+ "warmup_steps": 1000,
10
+ "sparsity_warmup_steps": 5000,
11
+ "steps": 244140,
12
+ "decay_start": 195312,
13
+ "seed": 3407,
14
+ "device": "cuda:0",
15
+ "layer": 17,
16
+ "lm_name": "Qwen/Qwen2.5-3B",
17
+ "wandb_name": "StandardTrainerNew-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_4",
18
+ "submodule_name": "resid_post_layer_17"
19
+ },
20
+ "buffer": {
21
+ "d_submodule": 2048,
22
+ "io": "out",
23
+ "n_ctxs": 122,
24
+ "ctx_len": 2048,
25
+ "refresh_batch_size": 4,
26
+ "out_batch_size": 2048,
27
+ "device": "cuda:0"
28
+ }
29
+ }
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_4/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 53.45625, "l1_loss": 432.64, "l0": 149.82208251953125, "frac_variance_explained": -6.97576171875, "cossim": 0.85923828125, "l2_ratio": 0.8296484375, "relative_reconstruction_bias": 7.51521484375, "frac_alive": 0.83062744140625, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_5/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28c5696a2ecfbf2359218fa97544779c1d61d3624e39e003962ad4474d62371f
3
+ size 268510888
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_5/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainerAprilUpdate",
5
+ "activation_dim": 2048,
6
+ "dict_size": 16384,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.06,
9
+ "warmup_steps": 1000,
10
+ "sparsity_warmup_steps": 5000,
11
+ "steps": 244140,
12
+ "decay_start": 195312,
13
+ "seed": 3407,
14
+ "device": "cuda:0",
15
+ "layer": 17,
16
+ "lm_name": "Qwen/Qwen2.5-3B",
17
+ "wandb_name": "StandardTrainerNew-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_5",
18
+ "submodule_name": "resid_post_layer_17"
19
+ },
20
+ "buffer": {
21
+ "d_submodule": 2048,
22
+ "io": "out",
23
+ "n_ctxs": 122,
24
+ "ctx_len": 2048,
25
+ "refresh_batch_size": 4,
26
+ "out_batch_size": 2048,
27
+ "device": "cuda:0"
28
+ }
29
+ }
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_5/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 60.575, "l1_loss": 426.7925, "l0": 80.94847900390624, "frac_variance_explained": -10.817421875, "cossim": 0.832265625, "l2_ratio": 0.79265625, "relative_reconstruction_bias": 9.3669921875, "frac_alive": 0.823486328125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_6/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66be800a8f2d8320361cff355cc1d088d8fed53b850abce16af2f04e3c2be36e
3
+ size 268511254
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_6/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "TopKTrainer",
4
+ "dict_class": "AutoEncoderTopK",
5
+ "lr": 0.0003,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "seed": 3407,
13
+ "activation_dim": 2048,
14
+ "dict_size": 16384,
15
+ "k": 50,
16
+ "device": "cuda:0",
17
+ "layer": 17,
18
+ "lm_name": "Qwen/Qwen2.5-3B",
19
+ "wandb_name": "TopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_6",
20
+ "submodule_name": "resid_post_layer_17"
21
+ },
22
+ "buffer": {
23
+ "d_submodule": 2048,
24
+ "io": "out",
25
+ "n_ctxs": 122,
26
+ "ctx_len": 2048,
27
+ "refresh_batch_size": 4,
28
+ "out_batch_size": 2048,
29
+ "device": "cuda:0"
30
+ }
31
+ }
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_6/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 36.115, "l1_loss": 225.495, "l0": 50.0, "frac_variance_explained": 0.8665625, "cossim": 0.8618359375, "l2_ratio": 0.8702734375, "relative_reconstruction_bias": 1.028359375, "frac_alive": 0.97332763671875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_7/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2bd0f50e52e30e6408603f136145e7afee6b43b070709be09fbea3469a97169
3
+ size 268511254
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_7/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "TopKTrainer",
4
+ "dict_class": "AutoEncoderTopK",
5
+ "lr": 0.0003,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "seed": 3407,
13
+ "activation_dim": 2048,
14
+ "dict_size": 16384,
15
+ "k": 80,
16
+ "device": "cuda:0",
17
+ "layer": 17,
18
+ "lm_name": "Qwen/Qwen2.5-3B",
19
+ "wandb_name": "TopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_7",
20
+ "submodule_name": "resid_post_layer_17"
21
+ },
22
+ "buffer": {
23
+ "d_submodule": 2048,
24
+ "io": "out",
25
+ "n_ctxs": 122,
26
+ "ctx_len": 2048,
27
+ "refresh_batch_size": 4,
28
+ "out_batch_size": 2048,
29
+ "device": "cuda:0"
30
+ }
31
+ }
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_7/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 33.834375, "l1_loss": 293.98, "l0": 80.0, "frac_variance_explained": 0.88892578125, "cossim": 0.8798828125, "l2_ratio": 0.88625, "relative_reconstruction_bias": 1.0291015625, "frac_alive": 0.990234375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_8/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:233cbd2e4493010b3bc05b126cf4ae32bc159cb29ae12590528dddfbfdd0a9e1
3
+ size 268511254
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_8/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "TopKTrainer",
4
+ "dict_class": "AutoEncoderTopK",
5
+ "lr": 0.0003,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "seed": 3407,
13
+ "activation_dim": 2048,
14
+ "dict_size": 16384,
15
+ "k": 160,
16
+ "device": "cuda:0",
17
+ "layer": 17,
18
+ "lm_name": "Qwen/Qwen2.5-3B",
19
+ "wandb_name": "TopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_8",
20
+ "submodule_name": "resid_post_layer_17"
21
+ },
22
+ "buffer": {
23
+ "d_submodule": 2048,
24
+ "io": "out",
25
+ "n_ctxs": 122,
26
+ "ctx_len": 2048,
27
+ "refresh_batch_size": 4,
28
+ "out_batch_size": 2048,
29
+ "device": "cuda:0"
30
+ }
31
+ }