Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_0/ae.pt +3 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_0/config.json +29 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_0/eval_results.json +1 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_1/ae.pt +3 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_1/config.json +29 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_1/eval_results.json +1 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_10/ae.pt +3 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_10/config.json +32 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_10/eval_results.json +1 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_11/ae.pt +3 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_11/config.json +32 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_11/eval_results.json +1 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_12/ae.pt +3 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_12/config.json +29 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_12/eval_results.json +1 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_13/ae.pt +3 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_13/config.json +29 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_13/eval_results.json +1 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_14/ae.pt +3 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_14/config.json +29 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_14/eval_results.json +1 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_15/ae.pt +3 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_15/config.json +29 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_15/eval_results.json +1 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_16/ae.pt +3 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_16/config.json +29 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_16/eval_results.json +1 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_17/ae.pt +3 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_17/config.json +29 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_17/eval_results.json +1 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_2/ae.pt +3 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_2/config.json +29 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_2/eval_results.json +1 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_3/ae.pt +3 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_3/config.json +29 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_3/eval_results.json +1 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_4/ae.pt +3 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_4/config.json +29 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_4/eval_results.json +1 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_5/ae.pt +3 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_5/config.json +29 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_5/eval_results.json +1 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_6/ae.pt +3 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_6/config.json +32 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_6/eval_results.json +1 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_7/ae.pt +3 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_7/config.json +32 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_7/eval_results.json +1 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_8/ae.pt +3 -0
- trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_8/config.json +32 -0
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_0/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1b71df4e2751d9da58c880b85e0b708c4a7b3bc84dc51f0aa460db5da4ab8de0
|
| 3 |
+
size 469843624
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_0/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"dict_class": "AutoEncoder",
|
| 4 |
+
"trainer_class": "StandardTrainerAprilUpdate",
|
| 5 |
+
"activation_dim": 3584,
|
| 6 |
+
"dict_size": 16384,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"l1_penalty": 0.012,
|
| 9 |
+
"warmup_steps": 1000,
|
| 10 |
+
"sparsity_warmup_steps": 5000,
|
| 11 |
+
"steps": 244140,
|
| 12 |
+
"decay_start": 195312,
|
| 13 |
+
"seed": 3407,
|
| 14 |
+
"device": "cuda:0",
|
| 15 |
+
"layer": 20,
|
| 16 |
+
"lm_name": "google/gemma-2-9b",
|
| 17 |
+
"wandb_name": "StandardTrainerNew-google/gemma-2-9b-resid_post_layer_20_trainer_0",
|
| 18 |
+
"submodule_name": "resid_post_layer_20"
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 3584,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 122,
|
| 24 |
+
"ctx_len": 2048,
|
| 25 |
+
"refresh_batch_size": 4,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_0/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 89.09, "l1_loss": 1378.96, "l0": 1085.1874340820314, "frac_variance_explained": 0.77041015625, "cossim": 0.90994140625, "l2_ratio": 0.884453125, "relative_reconstruction_bias": 0.99240234375, "frac_alive": 0.764892578125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_1/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0f6dfc6fccec855b0270832531a9092659f1076e701178c16051317d43059e38
|
| 3 |
+
size 469843624
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_1/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"dict_class": "AutoEncoder",
|
| 4 |
+
"trainer_class": "StandardTrainerAprilUpdate",
|
| 5 |
+
"activation_dim": 3584,
|
| 6 |
+
"dict_size": 16384,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"l1_penalty": 0.015,
|
| 9 |
+
"warmup_steps": 1000,
|
| 10 |
+
"sparsity_warmup_steps": 5000,
|
| 11 |
+
"steps": 244140,
|
| 12 |
+
"decay_start": 195312,
|
| 13 |
+
"seed": 3407,
|
| 14 |
+
"device": "cuda:0",
|
| 15 |
+
"layer": 20,
|
| 16 |
+
"lm_name": "google/gemma-2-9b",
|
| 17 |
+
"wandb_name": "StandardTrainerNew-google/gemma-2-9b-resid_post_layer_20_trainer_1",
|
| 18 |
+
"submodule_name": "resid_post_layer_20"
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 3584,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 122,
|
| 24 |
+
"ctx_len": 2048,
|
| 25 |
+
"refresh_batch_size": 4,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_1/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 96.165, "l1_loss": 1069.92, "l0": 748.5599877929687, "frac_variance_explained": 0.7313671875, "cossim": 0.89447265625, "l2_ratio": 0.87078125, "relative_reconstruction_bias": 0.9985546875, "frac_alive": 0.76300048828125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_10/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e2311b433e9c33d67c2e3b336d3634c49325690fefc0df64638e6358d4f97e55
|
| 3 |
+
size 469843990
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_10/config.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "BatchTopKTrainer",
|
| 4 |
+
"dict_class": "BatchTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 195312,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 1792,
|
| 13 |
+
"seed": 3407,
|
| 14 |
+
"activation_dim": 3584,
|
| 15 |
+
"dict_size": 16384,
|
| 16 |
+
"k": 520,
|
| 17 |
+
"device": "cuda:0",
|
| 18 |
+
"layer": 20,
|
| 19 |
+
"lm_name": "google/gemma-2-9b",
|
| 20 |
+
"wandb_name": "BatchTopKTrainer-google/gemma-2-9b-resid_post_layer_20_trainer_10",
|
| 21 |
+
"submodule_name": "resid_post_layer_20"
|
| 22 |
+
},
|
| 23 |
+
"buffer": {
|
| 24 |
+
"d_submodule": 3584,
|
| 25 |
+
"io": "out",
|
| 26 |
+
"n_ctxs": 122,
|
| 27 |
+
"ctx_len": 2048,
|
| 28 |
+
"refresh_batch_size": 4,
|
| 29 |
+
"out_batch_size": 2048,
|
| 30 |
+
"device": "cuda:0"
|
| 31 |
+
}
|
| 32 |
+
}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_10/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 77.4025, "l1_loss": 3337.92, "l0": 709.4781958007812, "frac_variance_explained": 0.828125, "cossim": 0.92984375, "l2_ratio": 0.92970703125, "relative_reconstruction_bias": 1.0009375, "frac_alive": 0.85882568359375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_11/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c685f19dc9ea10c04c4bcba75596d4bc709b917f0620688ed6e2a74018f1788a
|
| 3 |
+
size 469843990
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_11/config.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "BatchTopKTrainer",
|
| 4 |
+
"dict_class": "BatchTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 195312,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 1792,
|
| 13 |
+
"seed": 3407,
|
| 14 |
+
"activation_dim": 3584,
|
| 15 |
+
"dict_size": 16384,
|
| 16 |
+
"k": 820,
|
| 17 |
+
"device": "cuda:0",
|
| 18 |
+
"layer": 20,
|
| 19 |
+
"lm_name": "google/gemma-2-9b",
|
| 20 |
+
"wandb_name": "BatchTopKTrainer-google/gemma-2-9b-resid_post_layer_20_trainer_11",
|
| 21 |
+
"submodule_name": "resid_post_layer_20"
|
| 22 |
+
},
|
| 23 |
+
"buffer": {
|
| 24 |
+
"d_submodule": 3584,
|
| 25 |
+
"io": "out",
|
| 26 |
+
"n_ctxs": 122,
|
| 27 |
+
"ctx_len": 2048,
|
| 28 |
+
"refresh_batch_size": 4,
|
| 29 |
+
"out_batch_size": 2048,
|
| 30 |
+
"device": "cuda:0"
|
| 31 |
+
}
|
| 32 |
+
}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_11/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 66.6125, "l1_loss": 4846.72, "l0": 1101.4039184570313, "frac_variance_explained": 0.87087890625, "cossim": 0.9459375, "l2_ratio": 0.94115234375, "relative_reconstruction_bias": 0.99490234375, "frac_alive": 0.69659423828125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_12/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cd356475673932a2cdd2501c9007f13ea394fcd87d1c2e7e6c61b1b18a647512
|
| 3 |
+
size 469909279
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_12/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "JumpReluTrainer",
|
| 4 |
+
"dict_class": "JumpReluAutoEncoder",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"seed": 3407,
|
| 8 |
+
"activation_dim": 3584,
|
| 9 |
+
"dict_size": 16384,
|
| 10 |
+
"device": "cuda:0",
|
| 11 |
+
"layer": 20,
|
| 12 |
+
"lm_name": "google/gemma-2-9b",
|
| 13 |
+
"wandb_name": "JumpReluTrainer-google/gemma-2-9b-resid_post_layer_20_trainer_12",
|
| 14 |
+
"submodule_name": "resid_post_layer_20",
|
| 15 |
+
"bandwidth": 0.001,
|
| 16 |
+
"sparsity_penalty": 1.0,
|
| 17 |
+
"sparsity_warmup_steps": 5000,
|
| 18 |
+
"target_l0": 50
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 3584,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 122,
|
| 24 |
+
"ctx_len": 2048,
|
| 25 |
+
"refresh_batch_size": 4,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_12/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 113.1225, "l1_loss": 670.74, "l0": 62.107958984375, "frac_variance_explained": 0.63556640625, "cossim": 0.8490625, "l2_ratio": 0.85751953125, "relative_reconstruction_bias": 1.00828125, "frac_alive": 0.794921875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_13/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5105ec73026d976d765b82a928c2b97845f89cf91a77dc14f36a024825a9e4b5
|
| 3 |
+
size 469909279
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_13/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "JumpReluTrainer",
|
| 4 |
+
"dict_class": "JumpReluAutoEncoder",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"seed": 3407,
|
| 8 |
+
"activation_dim": 3584,
|
| 9 |
+
"dict_size": 16384,
|
| 10 |
+
"device": "cuda:0",
|
| 11 |
+
"layer": 20,
|
| 12 |
+
"lm_name": "google/gemma-2-9b",
|
| 13 |
+
"wandb_name": "JumpReluTrainer-google/gemma-2-9b-resid_post_layer_20_trainer_13",
|
| 14 |
+
"submodule_name": "resid_post_layer_20",
|
| 15 |
+
"bandwidth": 0.001,
|
| 16 |
+
"sparsity_penalty": 1.0,
|
| 17 |
+
"sparsity_warmup_steps": 5000,
|
| 18 |
+
"target_l0": 80
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 3584,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 122,
|
| 24 |
+
"ctx_len": 2048,
|
| 25 |
+
"refresh_batch_size": 4,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_13/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 106.7075, "l1_loss": 850.78, "l0": 105.3335107421875, "frac_variance_explained": 0.67470703125, "cossim": 0.86701171875, "l2_ratio": 0.8744921875, "relative_reconstruction_bias": 1.0080078125, "frac_alive": 0.87646484375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_14/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:09a8e5a9e1ab58d0587b48cd9a49fb6b752ddd4fe3c5b6d67164a5f988bb5540
|
| 3 |
+
size 469909279
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_14/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "JumpReluTrainer",
|
| 4 |
+
"dict_class": "JumpReluAutoEncoder",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"seed": 3407,
|
| 8 |
+
"activation_dim": 3584,
|
| 9 |
+
"dict_size": 16384,
|
| 10 |
+
"device": "cuda:0",
|
| 11 |
+
"layer": 20,
|
| 12 |
+
"lm_name": "google/gemma-2-9b",
|
| 13 |
+
"wandb_name": "JumpReluTrainer-google/gemma-2-9b-resid_post_layer_20_trainer_14",
|
| 14 |
+
"submodule_name": "resid_post_layer_20",
|
| 15 |
+
"bandwidth": 0.001,
|
| 16 |
+
"sparsity_penalty": 1.0,
|
| 17 |
+
"sparsity_warmup_steps": 5000,
|
| 18 |
+
"target_l0": 160
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 3584,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 122,
|
| 24 |
+
"ctx_len": 2048,
|
| 25 |
+
"refresh_batch_size": 4,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_14/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 96.0775, "l1_loss": 1289.84, "l0": 228.18613525390626, "frac_variance_explained": 0.73494140625, "cossim": 0.89296875, "l2_ratio": 0.89806640625, "relative_reconstruction_bias": 1.007421875, "frac_alive": 0.9716796875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_15/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9cef0eac303a5fbd346e1f6a37026541a71d41d5e61636dd738ea64d624c2806
|
| 3 |
+
size 469909279
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_15/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "JumpReluTrainer",
|
| 4 |
+
"dict_class": "JumpReluAutoEncoder",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"seed": 3407,
|
| 8 |
+
"activation_dim": 3584,
|
| 9 |
+
"dict_size": 16384,
|
| 10 |
+
"device": "cuda:0",
|
| 11 |
+
"layer": 20,
|
| 12 |
+
"lm_name": "google/gemma-2-9b",
|
| 13 |
+
"wandb_name": "JumpReluTrainer-google/gemma-2-9b-resid_post_layer_20_trainer_15",
|
| 14 |
+
"submodule_name": "resid_post_layer_20",
|
| 15 |
+
"bandwidth": 0.001,
|
| 16 |
+
"sparsity_penalty": 1.0,
|
| 17 |
+
"sparsity_warmup_steps": 5000,
|
| 18 |
+
"target_l0": 320
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 3584,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 122,
|
| 24 |
+
"ctx_len": 2048,
|
| 25 |
+
"refresh_batch_size": 4,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_15/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 83.7375, "l1_loss": 1982.44, "l0": 475.87285888671875, "frac_variance_explained": 0.79875, "cossim": 0.91837890625, "l2_ratio": 0.92162109375, "relative_reconstruction_bias": 1.0037109375, "frac_alive": 0.9964599609375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_16/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:37230e38854bdb9718c7a88373e19fbc1a482e7201231d54e98370f3b0ba3a05
|
| 3 |
+
size 469909279
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_16/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "JumpReluTrainer",
|
| 4 |
+
"dict_class": "JumpReluAutoEncoder",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"seed": 3407,
|
| 8 |
+
"activation_dim": 3584,
|
| 9 |
+
"dict_size": 16384,
|
| 10 |
+
"device": "cuda:0",
|
| 11 |
+
"layer": 20,
|
| 12 |
+
"lm_name": "google/gemma-2-9b",
|
| 13 |
+
"wandb_name": "JumpReluTrainer-google/gemma-2-9b-resid_post_layer_20_trainer_16",
|
| 14 |
+
"submodule_name": "resid_post_layer_20",
|
| 15 |
+
"bandwidth": 0.001,
|
| 16 |
+
"sparsity_penalty": 1.0,
|
| 17 |
+
"sparsity_warmup_steps": 5000,
|
| 18 |
+
"target_l0": 520
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 3584,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 122,
|
| 24 |
+
"ctx_len": 2048,
|
| 25 |
+
"refresh_batch_size": 4,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_16/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 72.4525, "l1_loss": 2695.04, "l0": 763.3169091796875, "frac_variance_explained": 0.8498046875, "cossim": 0.93865234375, "l2_ratio": 0.93953125, "relative_reconstruction_bias": 1.0021484375, "frac_alive": 0.97613525390625, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_17/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:46c50e326d6f0e8e8c75085f5c8abbbdfb1d4223ccf7aa5cd6d506f18ba98405
|
| 3 |
+
size 469909279
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_17/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "JumpReluTrainer",
|
| 4 |
+
"dict_class": "JumpReluAutoEncoder",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"seed": 3407,
|
| 8 |
+
"activation_dim": 3584,
|
| 9 |
+
"dict_size": 16384,
|
| 10 |
+
"device": "cuda:0",
|
| 11 |
+
"layer": 20,
|
| 12 |
+
"lm_name": "google/gemma-2-9b",
|
| 13 |
+
"wandb_name": "JumpReluTrainer-google/gemma-2-9b-resid_post_layer_20_trainer_17",
|
| 14 |
+
"submodule_name": "resid_post_layer_20",
|
| 15 |
+
"bandwidth": 0.001,
|
| 16 |
+
"sparsity_penalty": 1.0,
|
| 17 |
+
"sparsity_warmup_steps": 5000,
|
| 18 |
+
"target_l0": 820
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 3584,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 122,
|
| 24 |
+
"ctx_len": 2048,
|
| 25 |
+
"refresh_batch_size": 4,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_17/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 88.475, "l1_loss": 3319.2, "l0": 1094.02822265625, "frac_variance_explained": 0.82087890625, "cossim": 0.915390625, "l2_ratio": 1.0532421875, "relative_reconstruction_bias": 1.0951953125, "frac_alive": 0.8267822265625, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_2/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c13d63e813e9b1128c0b4f82b7787f4415f3e5240174d527113c47794c48c216
|
| 3 |
+
size 469843624
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_2/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"dict_class": "AutoEncoder",
|
| 4 |
+
"trainer_class": "StandardTrainerAprilUpdate",
|
| 5 |
+
"activation_dim": 3584,
|
| 6 |
+
"dict_size": 16384,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"l1_penalty": 0.02,
|
| 9 |
+
"warmup_steps": 1000,
|
| 10 |
+
"sparsity_warmup_steps": 5000,
|
| 11 |
+
"steps": 244140,
|
| 12 |
+
"decay_start": 195312,
|
| 13 |
+
"seed": 3407,
|
| 14 |
+
"device": "cuda:0",
|
| 15 |
+
"layer": 20,
|
| 16 |
+
"lm_name": "google/gemma-2-9b",
|
| 17 |
+
"wandb_name": "StandardTrainerNew-google/gemma-2-9b-resid_post_layer_20_trainer_2",
|
| 18 |
+
"submodule_name": "resid_post_layer_20"
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 3584,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 122,
|
| 24 |
+
"ctx_len": 2048,
|
| 25 |
+
"refresh_batch_size": 4,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_2/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 103.8625, "l1_loss": 782.32, "l0": 452.529501953125, "frac_variance_explained": 0.68568359375, "cossim": 0.87564453125, "l2_ratio": 0.85255859375, "relative_reconstruction_bias": 1.00189453125, "frac_alive": 0.76043701171875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_3/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dc6c7741e89ecb1f21ad5fc5d9fa2399190e5ff543b5f1e1d21176b91408b9c1
|
| 3 |
+
size 469843624
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_3/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"dict_class": "AutoEncoder",
|
| 4 |
+
"trainer_class": "StandardTrainerAprilUpdate",
|
| 5 |
+
"activation_dim": 3584,
|
| 6 |
+
"dict_size": 16384,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"l1_penalty": 0.03,
|
| 9 |
+
"warmup_steps": 1000,
|
| 10 |
+
"sparsity_warmup_steps": 5000,
|
| 11 |
+
"steps": 244140,
|
| 12 |
+
"decay_start": 195312,
|
| 13 |
+
"seed": 3407,
|
| 14 |
+
"device": "cuda:0",
|
| 15 |
+
"layer": 20,
|
| 16 |
+
"lm_name": "google/gemma-2-9b",
|
| 17 |
+
"wandb_name": "StandardTrainerNew-google/gemma-2-9b-resid_post_layer_20_trainer_3",
|
| 18 |
+
"submodule_name": "resid_post_layer_20"
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 3584,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 122,
|
| 24 |
+
"ctx_len": 2048,
|
| 25 |
+
"refresh_batch_size": 4,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_3/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 112.78, "l1_loss": 524.97, "l0": 226.3606103515625, "frac_variance_explained": 0.6291796875, "cossim": 0.8526171875, "l2_ratio": 0.8326171875, "relative_reconstruction_bias": 1.01302734375, "frac_alive": 0.7562255859375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_4/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:55839739e1dde5dfb250880c8737bdebe0b3502eca2f4fecd6330e8720a42a1e
|
| 3 |
+
size 469843624
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_4/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"dict_class": "AutoEncoder",
|
| 4 |
+
"trainer_class": "StandardTrainerAprilUpdate",
|
| 5 |
+
"activation_dim": 3584,
|
| 6 |
+
"dict_size": 16384,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"l1_penalty": 0.04,
|
| 9 |
+
"warmup_steps": 1000,
|
| 10 |
+
"sparsity_warmup_steps": 5000,
|
| 11 |
+
"steps": 244140,
|
| 12 |
+
"decay_start": 195312,
|
| 13 |
+
"seed": 3407,
|
| 14 |
+
"device": "cuda:0",
|
| 15 |
+
"layer": 20,
|
| 16 |
+
"lm_name": "google/gemma-2-9b",
|
| 17 |
+
"wandb_name": "StandardTrainerNew-google/gemma-2-9b-resid_post_layer_20_trainer_4",
|
| 18 |
+
"submodule_name": "resid_post_layer_20"
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 3584,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 122,
|
| 24 |
+
"ctx_len": 2048,
|
| 25 |
+
"refresh_batch_size": 4,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_4/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 118.235, "l1_loss": 401.73, "l0": 140.89248291015625, "frac_variance_explained": 0.58724609375, "cossim": 0.83705078125, "l2_ratio": 0.8162890625, "relative_reconstruction_bias": 1.0190234375, "frac_alive": 0.75213623046875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_5/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aabc742dd6f851593f23b713d949249ef667211e71c7bbb2307600767b87b1e9
|
| 3 |
+
size 469843624
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_5/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"dict_class": "AutoEncoder",
|
| 4 |
+
"trainer_class": "StandardTrainerAprilUpdate",
|
| 5 |
+
"activation_dim": 3584,
|
| 6 |
+
"dict_size": 16384,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"l1_penalty": 0.06,
|
| 9 |
+
"warmup_steps": 1000,
|
| 10 |
+
"sparsity_warmup_steps": 5000,
|
| 11 |
+
"steps": 244140,
|
| 12 |
+
"decay_start": 195312,
|
| 13 |
+
"seed": 3407,
|
| 14 |
+
"device": "cuda:0",
|
| 15 |
+
"layer": 20,
|
| 16 |
+
"lm_name": "google/gemma-2-9b",
|
| 17 |
+
"wandb_name": "StandardTrainerNew-google/gemma-2-9b-resid_post_layer_20_trainer_5",
|
| 18 |
+
"submodule_name": "resid_post_layer_20"
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 3584,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 122,
|
| 24 |
+
"ctx_len": 2048,
|
| 25 |
+
"refresh_batch_size": 4,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_5/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 125.69, "l1_loss": 276.76, "l0": 73.5276318359375, "frac_variance_explained": 0.53919921875, "cossim": 0.8143359375, "l2_ratio": 0.79361328125, "relative_reconstruction_bias": 1.0208203125, "frac_alive": 0.74652099609375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_6/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:54d29287a996d1e1b1c8d5ddc34e70791f28ad8ea22a2f5b129adc553e499cd3
|
| 3 |
+
size 469843990
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_6/config.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "BatchTopKTrainer",
|
| 4 |
+
"dict_class": "BatchTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 195312,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 1792,
|
| 13 |
+
"seed": 3407,
|
| 14 |
+
"activation_dim": 3584,
|
| 15 |
+
"dict_size": 16384,
|
| 16 |
+
"k": 50,
|
| 17 |
+
"device": "cuda:0",
|
| 18 |
+
"layer": 20,
|
| 19 |
+
"lm_name": "google/gemma-2-9b",
|
| 20 |
+
"wandb_name": "BatchTopKTrainer-google/gemma-2-9b-resid_post_layer_20_trainer_6",
|
| 21 |
+
"submodule_name": "resid_post_layer_20"
|
| 22 |
+
},
|
| 23 |
+
"buffer": {
|
| 24 |
+
"d_submodule": 3584,
|
| 25 |
+
"io": "out",
|
| 26 |
+
"n_ctxs": 122,
|
| 27 |
+
"ctx_len": 2048,
|
| 28 |
+
"refresh_batch_size": 4,
|
| 29 |
+
"out_batch_size": 2048,
|
| 30 |
+
"device": "cuda:0"
|
| 31 |
+
}
|
| 32 |
+
}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_6/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 113.03, "l1_loss": 643.72, "l0": 59.92653564453125, "frac_variance_explained": 0.63427734375, "cossim": 0.84900390625, "l2_ratio": 0.85697265625, "relative_reconstruction_bias": 1.0082421875, "frac_alive": 0.93328857421875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_7/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e527bcd86993559a7d07cc2d3a5b9ed83df35ac0f3652b0980566f878f633f0e
|
| 3 |
+
size 469843990
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_7/config.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "BatchTopKTrainer",
|
| 4 |
+
"dict_class": "BatchTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 195312,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 1792,
|
| 13 |
+
"seed": 3407,
|
| 14 |
+
"activation_dim": 3584,
|
| 15 |
+
"dict_size": 16384,
|
| 16 |
+
"k": 80,
|
| 17 |
+
"device": "cuda:0",
|
| 18 |
+
"layer": 20,
|
| 19 |
+
"lm_name": "google/gemma-2-9b",
|
| 20 |
+
"wandb_name": "BatchTopKTrainer-google/gemma-2-9b-resid_post_layer_20_trainer_7",
|
| 21 |
+
"submodule_name": "resid_post_layer_20"
|
| 22 |
+
},
|
| 23 |
+
"buffer": {
|
| 24 |
+
"d_submodule": 3584,
|
| 25 |
+
"io": "out",
|
| 26 |
+
"n_ctxs": 122,
|
| 27 |
+
"ctx_len": 2048,
|
| 28 |
+
"refresh_batch_size": 4,
|
| 29 |
+
"out_batch_size": 2048,
|
| 30 |
+
"device": "cuda:0"
|
| 31 |
+
}
|
| 32 |
+
}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_7/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 106.885, "l1_loss": 847.28, "l0": 101.578388671875, "frac_variance_explained": 0.67421875, "cossim": 0.86607421875, "l2_ratio": 0.87265625, "relative_reconstruction_bias": 1.00671875, "frac_alive": 0.9200439453125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_8/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f648ea19eb580d53fa01f59aea7db3b93e1fbc4a264bae9328fc391a54d380a7
|
| 3 |
+
size 469843990
|
trained_saes___google_gemma-2-9b_batch_top_k_jump_relu_standard_new/resid_post_layer_20/trainer_8/config.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "BatchTopKTrainer",
|
| 4 |
+
"dict_class": "BatchTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 195312,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 1792,
|
| 13 |
+
"seed": 3407,
|
| 14 |
+
"activation_dim": 3584,
|
| 15 |
+
"dict_size": 16384,
|
| 16 |
+
"k": 160,
|
| 17 |
+
"device": "cuda:0",
|
| 18 |
+
"layer": 20,
|
| 19 |
+
"lm_name": "google/gemma-2-9b",
|
| 20 |
+
"wandb_name": "BatchTopKTrainer-google/gemma-2-9b-resid_post_layer_20_trainer_8",
|
| 21 |
+
"submodule_name": "resid_post_layer_20"
|
| 22 |
+
},
|
| 23 |
+
"buffer": {
|
| 24 |
+
"d_submodule": 3584,
|
| 25 |
+
"io": "out",
|
| 26 |
+
"n_ctxs": 122,
|
| 27 |
+
"ctx_len": 2048,
|
| 28 |
+
"refresh_batch_size": 4,
|
| 29 |
+
"out_batch_size": 2048,
|
| 30 |
+
"device": "cuda:0"
|
| 31 |
+
}
|
| 32 |
+
}
|